nanoarrow/0000755000176200001440000000000014557000642012263 5ustar liggesusersnanoarrow/NAMESPACE0000644000176200001440000001512714547061553013517 0ustar liggesusers# Generated by roxygen2: do not edit by hand S3method("$",nanoarrow_array) S3method("$",nanoarrow_array_stream) S3method("$",nanoarrow_buffer) S3method("$",nanoarrow_schema) S3method("$<-",nanoarrow_array) S3method("$<-",nanoarrow_schema) S3method("[[",nanoarrow_array) S3method("[[",nanoarrow_array_stream) S3method("[[",nanoarrow_buffer) S3method("[[",nanoarrow_schema) S3method("[[<-",nanoarrow_array) S3method("[[<-",nanoarrow_schema) S3method(as.data.frame,nanoarrow_array) S3method(as.data.frame,nanoarrow_array_stream) S3method(as.raw,nanoarrow_buffer) S3method(as.vector,nanoarrow_array) S3method(as.vector,nanoarrow_array_stream) S3method(as.vector,nanoarrow_buffer) S3method(as_nanoarrow_array,Array) S3method(as_nanoarrow_array,ChunkedArray) S3method(as_nanoarrow_array,Date) S3method(as_nanoarrow_array,POSIXct) S3method(as_nanoarrow_array,POSIXlt) S3method(as_nanoarrow_array,RecordBatch) S3method(as_nanoarrow_array,Table) S3method(as_nanoarrow_array,blob) S3method(as_nanoarrow_array,data.frame) S3method(as_nanoarrow_array,default) S3method(as_nanoarrow_array,difftime) S3method(as_nanoarrow_array,factor) S3method(as_nanoarrow_array,integer64) S3method(as_nanoarrow_array,list) S3method(as_nanoarrow_array,nanoarrow_array) S3method(as_nanoarrow_array,nanoarrow_buffer) S3method(as_nanoarrow_array,vctrs_unspecified) S3method(as_nanoarrow_array_extension,default) S3method(as_nanoarrow_array_extension,nanoarrow_extension_spec_vctrs) S3method(as_nanoarrow_array_stream,Array) S3method(as_nanoarrow_array_stream,ArrowTabular) S3method(as_nanoarrow_array_stream,ChunkedArray) S3method(as_nanoarrow_array_stream,Dataset) S3method(as_nanoarrow_array_stream,RecordBatchReader) S3method(as_nanoarrow_array_stream,Scanner) S3method(as_nanoarrow_array_stream,arrow_dplyr_query) S3method(as_nanoarrow_array_stream,data.frame) 
S3method(as_nanoarrow_array_stream,default) S3method(as_nanoarrow_array_stream,nanoarrow_array) S3method(as_nanoarrow_array_stream,nanoarrow_array_stream) S3method(as_nanoarrow_buffer,default) S3method(as_nanoarrow_buffer,nanoarrow_buffer) S3method(as_nanoarrow_schema,DataType) S3method(as_nanoarrow_schema,Field) S3method(as_nanoarrow_schema,Schema) S3method(as_nanoarrow_schema,nanoarrow_schema) S3method(convert_array,default) S3method(convert_array,double) S3method(convert_array,factor) S3method(convert_array,vctrs_partial_frame) S3method(convert_array_extension,default) S3method(convert_array_extension,nanoarrow_extension_spec_vctrs) S3method(format,nanoarrow_array) S3method(format,nanoarrow_array_stream) S3method(format,nanoarrow_buffer) S3method(format,nanoarrow_schema) S3method(infer_nanoarrow_ptype_extension,default) S3method(infer_nanoarrow_ptype_extension,nanoarrow_extension_spec_vctrs) S3method(infer_nanoarrow_schema,Array) S3method(infer_nanoarrow_schema,ArrowTabular) S3method(infer_nanoarrow_schema,AsIs) S3method(infer_nanoarrow_schema,ChunkedArray) S3method(infer_nanoarrow_schema,Dataset) S3method(infer_nanoarrow_schema,Date) S3method(infer_nanoarrow_schema,Expression) S3method(infer_nanoarrow_schema,POSIXct) S3method(infer_nanoarrow_schema,POSIXlt) S3method(infer_nanoarrow_schema,RecordBatchReader) S3method(infer_nanoarrow_schema,Scalar) S3method(infer_nanoarrow_schema,Scanner) S3method(infer_nanoarrow_schema,arrow_dplyr_query) S3method(infer_nanoarrow_schema,blob) S3method(infer_nanoarrow_schema,character) S3method(infer_nanoarrow_schema,data.frame) S3method(infer_nanoarrow_schema,default) S3method(infer_nanoarrow_schema,difftime) S3method(infer_nanoarrow_schema,double) S3method(infer_nanoarrow_schema,factor) S3method(infer_nanoarrow_schema,hms) S3method(infer_nanoarrow_schema,integer) S3method(infer_nanoarrow_schema,integer64) S3method(infer_nanoarrow_schema,list) S3method(infer_nanoarrow_schema,logical) 
S3method(infer_nanoarrow_schema,nanoarrow_array) S3method(infer_nanoarrow_schema,nanoarrow_array_stream) S3method(infer_nanoarrow_schema,raw) S3method(infer_nanoarrow_schema,vctrs_list_of) S3method(infer_nanoarrow_schema,vctrs_unspecified) S3method(length,nanoarrow_array) S3method(length,nanoarrow_array_stream) S3method(length,nanoarrow_buffer) S3method(length,nanoarrow_schema) S3method(names,nanoarrow_array) S3method(names,nanoarrow_array_stream) S3method(names,nanoarrow_buffer) S3method(names,nanoarrow_schema) S3method(print,nanoarrow_array) S3method(print,nanoarrow_array_stream) S3method(print,nanoarrow_buffer) S3method(print,nanoarrow_schema) S3method(str,nanoarrow_array) S3method(str,nanoarrow_array_stream) S3method(str,nanoarrow_buffer) S3method(str,nanoarrow_schema) export(array_stream_set_finalizer) export(as_nanoarrow_array) export(as_nanoarrow_array_extension) export(as_nanoarrow_array_stream) export(as_nanoarrow_buffer) export(as_nanoarrow_schema) export(basic_array_stream) export(collect_array_stream) export(convert_array) export(convert_array_extension) export(convert_array_stream) export(convert_buffer) export(infer_nanoarrow_ptype) export(infer_nanoarrow_ptype_extension) export(infer_nanoarrow_schema) export(na_binary) export(na_bool) export(na_date32) export(na_date64) export(na_decimal128) export(na_decimal256) export(na_dense_union) export(na_dictionary) export(na_double) export(na_duration) export(na_extension) export(na_fixed_size_binary) export(na_fixed_size_list) export(na_float) export(na_half_float) export(na_int16) export(na_int32) export(na_int64) export(na_int8) export(na_interval_day_time) export(na_interval_month_day_nano) export(na_interval_months) export(na_large_binary) export(na_large_list) export(na_large_string) export(na_list) export(na_map) export(na_na) export(na_sparse_union) export(na_string) export(na_struct) export(na_time32) export(na_time64) export(na_timestamp) export(na_type) export(na_uint16) export(na_uint32) 
export(na_uint64) export(na_uint8) export(na_vctrs) export(nanoarrow_allocate_array) export(nanoarrow_allocate_array_stream) export(nanoarrow_allocate_schema) export(nanoarrow_array_init) export(nanoarrow_array_modify) export(nanoarrow_array_set_schema) export(nanoarrow_buffer_append) export(nanoarrow_buffer_init) export(nanoarrow_extension_array) export(nanoarrow_extension_spec) export(nanoarrow_pointer_addr_chr) export(nanoarrow_pointer_addr_dbl) export(nanoarrow_pointer_addr_pretty) export(nanoarrow_pointer_export) export(nanoarrow_pointer_is_valid) export(nanoarrow_pointer_move) export(nanoarrow_pointer_release) export(nanoarrow_pointer_set_protected) export(nanoarrow_schema_modify) export(nanoarrow_schema_parse) export(nanoarrow_version) export(register_nanoarrow_extension) export(resolve_nanoarrow_extension) export(unregister_nanoarrow_extension) importFrom(utils,getFromNamespace) importFrom(utils,str) useDynLib(nanoarrow, .registration = TRUE) nanoarrow/tools/0000755000176200001440000000000014502402562013420 5ustar liggesusersnanoarrow/tools/make-callentries.R0000644000176200001440000000470314502402562016767 0ustar liggesusers# Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
# This file updates the call entries in src/init.c such that R code # can use .Call(nanoarrow_c_some_function_name) for all C functions # with the signature SEXP nanoarrow_c_some_function_name(...). library(tidyverse) src_files <- list.files("src", "\\.(c|cpp)$", full.names = TRUE) %>% setdiff("src/init.c") src_sources <- src_files %>% set_names() %>% map_chr(readr::read_file) defs <- tibble( def = src_sources %>% str_extract_all(regex("SEXP nanoarrow_c_[^\\)]+\\)", multiline = TRUE)) %>% unlist() %>% unique() %>% str_replace_all("\\s+", " ") %>% str_trim(), name = def %>% str_extract("nanoarrow_c_[^\\(]+"), return_type = "SEXP", args = def %>% str_remove("SEXP nanoarrow_c_[^\\(]+\\(") %>% str_remove("\\)$") %>% str_split("\\s*,\\s*") %>% map(~{if(identical(.x, "") || identical(.x, "void")) character(0) else .x}), n_args = map(args, length) ) call_headers <- paste0( "extern ", defs$def, ";", collapse = "\n" ) call_entries <- paste0( '{"', defs$name, '", (DL_FUNC)&', defs$name, ', ', defs$n_args, "},", collapse = "\n " ) header <- glue::glue(' /* generated by tools/make-callentries.R */ {call_headers} static const R_CallMethodDef CallEntries[] = {{ {call_entries} {{NULL, NULL, 0}}}}; /* end generated by tools/make-callentries.R */ ') # rewrite relevant portion of init.c init <- read_file("src/init.c") pattern <- regex( "\n/\\* generated by tools/make-callentries\\.R \\*/.*?/\\* end generated by tools/make-callentries\\.R \\*/", multiline = TRUE, dotall = TRUE ) stopifnot(str_detect(init, pattern)) init %>% str_replace(pattern, header) %>% write_file("src/init.c") system("clang-format -i src/init.c") nanoarrow/README.md0000644000176200001440000001730014556775536013567 0ustar liggesusers # nanoarrow The goal of nanoarrow is to provide minimal useful bindings to the [Arrow C Data](https://arrow.apache.org/docs/format/CDataInterface.html) and [Arrow C Stream](https://arrow.apache.org/docs/format/CStreamInterface.html) interfaces using the [nanoarrow C 
library](https://arrow.apache.org/nanoarrow/). ## Installation You can install the released version of nanoarrow from [CRAN](https://cran.r-project.org/) with: ``` r install.packages("nanoarrow") ``` You can install the development version of nanoarrow from [GitHub](https://github.com/) with: ``` r # install.packages("remotes") remotes::install_github("apache/arrow-nanoarrow/r") ``` If you can load the package, you’re good to go! ``` r library(nanoarrow) ``` ## Example The Arrow C Data and Arrow C Stream interfaces are comprised of three structures: the `ArrowSchema` which represents a data type of an array, the `ArrowArray` which represents the values of an array, and an `ArrowArrayStream`, which represents zero or more `ArrowArray`s with a common `ArrowSchema`. All three can be wrapped by R objects using the nanoarrow R package. ### Schemas Use `infer_nanoarrow_schema()` to get the ArrowSchema object that corresponds to a given R vector type; use `as_nanoarrow_schema()` to convert an object from some other data type representation (e.g., an arrow R package `DataType` like `arrow::int32()`); or use `na_XXX()` functions to construct them. ``` r infer_nanoarrow_schema(1:5) #> #> $ format : chr "i" #> $ name : chr "" #> $ metadata : list() #> $ flags : int 2 #> $ children : list() #> $ dictionary: NULL as_nanoarrow_schema(arrow::schema(col1 = arrow::float64())) #> #> $ format : chr "+s" #> $ name : chr "" #> $ metadata : list() #> $ flags : int 0 #> $ children :List of 1 #> ..$ col1: #> .. ..$ format : chr "g" #> .. ..$ name : chr "col1" #> .. ..$ metadata : list() #> .. ..$ flags : int 2 #> .. ..$ children : list() #> .. 
..$ dictionary: NULL #> $ dictionary: NULL na_int64() #> #> $ format : chr "l" #> $ name : chr "" #> $ metadata : list() #> $ flags : int 2 #> $ children : list() #> $ dictionary: NULL ``` ### Arrays Use `as_nanoarrow_array()` to convert an object to an ArrowArray object: ``` r as_nanoarrow_array(1:5) #> #> $ length : int 5 #> $ null_count: int 0 #> $ offset : int 0 #> $ buffers :List of 2 #> ..$ :[0][0 b]> `` #> ..$ :[5][20 b]> `1 2 3 4 5` #> $ dictionary: NULL #> $ children : list() as_nanoarrow_array(data.frame(col1 = c(1.1, 2.2))) #> #> $ length : int 2 #> $ null_count: int 0 #> $ offset : int 0 #> $ buffers :List of 1 #> ..$ :[0][0 b]> `` #> $ children :List of 1 #> ..$ col1: #> .. ..$ length : int 2 #> .. ..$ null_count: int 0 #> .. ..$ offset : int 0 #> .. ..$ buffers :List of 2 #> .. .. ..$ :[0][0 b]> `` #> .. .. ..$ :[2][16 b]> `1.1 2.2` #> .. ..$ dictionary: NULL #> .. ..$ children : list() #> $ dictionary: NULL ``` You can use `as.vector()` or `as.data.frame()` to get the R representation of the object back: ``` r array <- as_nanoarrow_array(data.frame(col1 = c(1.1, 2.2))) as.data.frame(array) #> col1 #> 1 1.1 #> 2 2.2 ``` Even though at the C level the ArrowArray is distinct from the ArrowSchema, at the R level we attach a schema wherever possible. You can access the attached schema using `infer_nanoarrow_schema()`: ``` r infer_nanoarrow_schema(array) #> #> $ format : chr "+s" #> $ name : chr "" #> $ metadata : list() #> $ flags : int 0 #> $ children :List of 1 #> ..$ col1: #> .. ..$ format : chr "g" #> .. ..$ name : chr "col1" #> .. ..$ metadata : list() #> .. ..$ flags : int 2 #> .. ..$ children : list() #> .. 
..$ dictionary: NULL #> $ dictionary: NULL ``` ### Array Streams The easiest way to create an ArrowArrayStream is from a list of arrays or objects that can be converted to an array using `as_nanoarrow_array()`: ``` r stream <- basic_array_stream( list( data.frame(col1 = c(1.1, 2.2)), data.frame(col1 = c(3.3, 4.4)) ) ) ``` You can pull batches from the stream using the `$get_next()` method. The last batch will return `NULL`. ``` r stream$get_next() #> #> $ length : int 2 #> $ null_count: int 0 #> $ offset : int 0 #> $ buffers :List of 1 #> ..$ :[0][0 b]> `` #> $ children :List of 1 #> ..$ col1: #> .. ..$ length : int 2 #> .. ..$ null_count: int 0 #> .. ..$ offset : int 0 #> .. ..$ buffers :List of 2 #> .. .. ..$ :[0][0 b]> `` #> .. .. ..$ :[2][16 b]> `1.1 2.2` #> .. ..$ dictionary: NULL #> .. ..$ children : list() #> $ dictionary: NULL stream$get_next() #> #> $ length : int 2 #> $ null_count: int 0 #> $ offset : int 0 #> $ buffers :List of 1 #> ..$ :[0][0 b]> `` #> $ children :List of 1 #> ..$ col1: #> .. ..$ length : int 2 #> .. ..$ null_count: int 0 #> .. ..$ offset : int 0 #> .. ..$ buffers :List of 2 #> .. .. ..$ :[0][0 b]> `` #> .. .. ..$ :[2][16 b]> `3.3 4.4` #> .. ..$ dictionary: NULL #> .. ..$ children : list() #> $ dictionary: NULL stream$get_next() #> NULL ``` You can pull all the batches into a `data.frame()` by calling `as.data.frame()` or `as.vector()`: ``` r stream <- basic_array_stream( list( data.frame(col1 = c(1.1, 2.2)), data.frame(col1 = c(3.3, 4.4)) ) ) as.data.frame(stream) #> col1 #> 1 1.1 #> 2 2.2 #> 3 3.3 #> 4 4.4 ``` After consuming a stream, you should call the release method as soon as you can. This lets the implementation of the stream release any resources (like open files) it may be holding in a more predictable way than waiting for the garbage collector to clean up the object. 
## Integration with the arrow package The nanoarrow package implements `as_nanoarrow_schema()`, `as_nanoarrow_array()`, and `as_nanoarrow_array_stream()` for most arrow package types. Similarly, it implements `arrow::as_arrow_array()`, `arrow::as_record_batch()`, `arrow::as_arrow_table()`, `arrow::as_record_batch_reader()`, `arrow::infer_type()`, `arrow::as_data_type()`, and `arrow::as_schema()` for nanoarrow objects such that you can pass equivalent nanoarrow objects into many arrow functions and vice versa. nanoarrow/man/0000755000176200001440000000000014547575511013051 5ustar liggesusersnanoarrow/man/infer_nanoarrow_ptype_extension.Rd0000644000176200001440000000261514502402562022034 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/extension.R \name{infer_nanoarrow_ptype_extension} \alias{infer_nanoarrow_ptype_extension} \alias{convert_array_extension} \alias{as_nanoarrow_array_extension} \title{Implement Arrow extension types} \usage{ infer_nanoarrow_ptype_extension( extension_spec, x, ..., warn_unregistered = TRUE ) convert_array_extension( extension_spec, array, to, ..., warn_unregistered = TRUE ) as_nanoarrow_array_extension(extension_spec, x, ..., schema = NULL) } \arguments{ \item{extension_spec}{An extension specification inheriting from 'nanoarrow_extension_spec'.} \item{x, array, to, schema, ...}{Passed from \code{\link[=infer_nanoarrow_ptype]{infer_nanoarrow_ptype()}}, \code{\link[=convert_array]{convert_array()}}, \code{\link[=as_nanoarrow_array]{as_nanoarrow_array()}}, and/or \code{\link[=as_nanoarrow_array_stream]{as_nanoarrow_array_stream()}}.} \item{warn_unregistered}{Use \code{FALSE} to infer/convert based on the storage type without a warning.} } \value{ \itemize{ \item \code{infer_nanoarrow_ptype_extension()}: The R vector prototype to be used as the default conversion target. \item \code{convert_array_extension()}: An R vector of type \code{to}. 
\item \code{as_nanoarrow_array_extension()}: A \link[=as_nanoarrow_array]{nanoarrow_array} of type \code{schema}. } } \description{ Implement Arrow extension types } nanoarrow/man/nanoarrow_extension_array.Rd0000644000176200001440000000161214502402562020622 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/extension.R \name{nanoarrow_extension_array} \alias{nanoarrow_extension_array} \title{Create Arrow extension arrays} \usage{ nanoarrow_extension_array( storage_array, extension_name, extension_metadata = NULL ) } \arguments{ \item{storage_array}{A \link[=as_nanoarrow_array]{nanoarrow_array}.} \item{extension_name}{For \code{\link[=na_extension]{na_extension()}}, the extension name. This is typically namespaced separated by dots (e.g., arrow.r.vctrs).} \item{extension_metadata}{A string or raw vector defining extension metadata. Most Arrow extension types define extension metadata as a JSON object.} } \value{ A \link[=as_nanoarrow_array]{nanoarrow_array} with attached extension schema. } \description{ Create Arrow extension arrays } \examples{ nanoarrow_extension_array(1:10, "some_ext", '{"key": "value"}') } nanoarrow/man/convert_array.Rd0000644000176200001440000000765614502402562016216 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/convert-array.R \name{convert_array} \alias{convert_array} \title{Convert an Array into an R vector} \usage{ convert_array(array, to = NULL, ...) } \arguments{ \item{array}{A \link[=as_nanoarrow_array]{nanoarrow_array}.} \item{to}{A target prototype object describing the type to which \code{array} should be converted, or \code{NULL} to use the default conversion as returned by \code{\link[=infer_nanoarrow_ptype]{infer_nanoarrow_ptype()}}. 
Alternatively, a function can be passed to perform an alternative calculation of the default ptype as a function of \code{array} and the default inference of the prototype.} \item{...}{Passed to S3 methods} } \value{ An R vector of type \code{to}. } \description{ Converts \code{array} to the type specified by \code{to}. This is a low-level interface; most users should use \code{as.data.frame()} or \code{as.vector()} unless finer-grained control is needed over the conversion. This function is an S3 generic dispatching on \code{to}: developers may implement their own S3 methods for custom vector types. } \details{ Conversions are implemented for the following R vector types: \itemize{ \item \code{\link[=logical]{logical()}}: Any numeric type can be converted to \code{\link[=logical]{logical()}} in addition to the bool type. For numeric types, any non-zero value is considered \code{TRUE}. \item \code{\link[=integer]{integer()}}: Any numeric type can be converted to \code{\link[=integer]{integer()}}; however, a warning will be signaled if any value is outside the range of a 32-bit integer. \item \code{\link[=double]{double()}}: Any numeric type can be converted to \code{\link[=double]{double()}}. This conversion currently does not warn for values that may not roundtrip through a floating-point double (e.g., very large uint64 and int64 values). \item \code{\link[=character]{character()}}: String and large string types can be converted to \code{\link[=character]{character()}}. The conversion does not check for valid UTF-8: if you need finer-grained control over encodings, use \code{to = blob::blob()}. \item \code{\link[=factor]{factor()}}: Dictionary-encoded arrays of strings can be converted to \code{factor()}; however, this must be specified explicitly (i.e., \code{convert_array(array, factor())}) because arrays arriving in chunks can have dictionaries that contain different levels. 
Use \code{convert_array(array, factor(levels = c(...)))} to materialize an array into a vector with known levels. \item \link[=as.Date]{Date}: Only the date32 type can be converted to an R Date vector. \item \code{\link[hms:hms]{hms::hms()}}: Time32 and time64 types can be converted to \code{\link[hms:hms]{hms::hms()}}. \item \code{\link[=difftime]{difftime()}}: Time32, time64, and duration types can be converted to R \code{\link[=difftime]{difftime()}} vectors. The value is converted to match the \code{\link[=units]{units()}} attribute of \code{to}. \item \code{\link[blob:blob]{blob::blob()}}: String, large string, binary, and large binary types can be converted to \code{\link[blob:blob]{blob::blob()}}. \item \code{\link[vctrs:list_of]{vctrs::list_of()}}: List, large list, and fixed-size list types can be converted to \code{\link[vctrs:list_of]{vctrs::list_of()}}. \item \code{\link[=data.frame]{data.frame()}}: Struct types can be converted to \code{\link[=data.frame]{data.frame()}}. \item \code{\link[vctrs:unspecified]{vctrs::unspecified()}}: Any type can be converted to \code{\link[vctrs:unspecified]{vctrs::unspecified()}}; however, a warning will be raised if any non-null values are encountered. } In addition to the above conversions, a null array may be converted to any target prototype except \code{\link[=data.frame]{data.frame()}}. Extension arrays are currently converted as their storage type. } \examples{ array <- as_nanoarrow_array(data.frame(x = 1:5)) str(convert_array(array)) str(convert_array(array, to = data.frame(x = double()))) } nanoarrow/man/as_nanoarrow_buffer.Rd0000644000176200001440000000114714502402506017345 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/buffer.R \name{as_nanoarrow_buffer} \alias{as_nanoarrow_buffer} \title{Convert an object to a nanoarrow buffer} \usage{ as_nanoarrow_buffer(x, ...) 
} \arguments{ \item{x}{An object to convert to a buffer} \item{...}{Passed to S3 methods} } \value{ An object of class 'nanoarrow_buffer' } \description{ Convert an object to a nanoarrow buffer } \examples{ array <- as_nanoarrow_array(c(NA, 1:4)) array$buffers as.raw(array$buffers[[1]]) as.raw(array$buffers[[2]]) convert_buffer(array$buffers[[1]]) convert_buffer(array$buffers[[2]]) } nanoarrow/man/basic_array_stream.Rd0000644000176200001440000000165114355103326017161 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/array-stream.R \name{basic_array_stream} \alias{basic_array_stream} \title{Create ArrayStreams from batches} \usage{ basic_array_stream(batches, schema = NULL, validate = TRUE) } \arguments{ \item{batches}{A \code{\link[=list]{list()}} of \link[=as_nanoarrow_array]{nanoarrow_array} objects or objects that can be coerced via \code{\link[=as_nanoarrow_array]{as_nanoarrow_array()}}.} \item{schema}{A \link[=as_nanoarrow_schema]{nanoarrow_schema} or \code{NULL} to guess based on the first schema.} \item{validate}{Use \code{FALSE} to skip the validation step (i.e., if you know that the arrays are valid).} } \value{ An \link[=as_nanoarrow_array_stream]{nanoarrow_array_stream} } \description{ Create ArrayStreams from batches } \examples{ (stream <- basic_array_stream(list(data.frame(a = 1, b = 2)))) as.data.frame(stream$get_next()) stream$get_next() } nanoarrow/man/na_type.Rd0000644000176200001440000001277514377444470015013 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/type.R \name{na_type} \alias{na_type} \alias{na_na} \alias{na_bool} \alias{na_int8} \alias{na_uint8} \alias{na_int16} \alias{na_uint16} \alias{na_int32} \alias{na_uint32} \alias{na_int64} \alias{na_uint64} \alias{na_half_float} \alias{na_float} \alias{na_double} \alias{na_string} \alias{na_large_string} \alias{na_binary} \alias{na_large_binary} \alias{na_fixed_size_binary} \alias{na_date32} 
\alias{na_date64} \alias{na_time32} \alias{na_time64} \alias{na_duration} \alias{na_interval_months} \alias{na_interval_day_time} \alias{na_interval_month_day_nano} \alias{na_timestamp} \alias{na_decimal128} \alias{na_decimal256} \alias{na_struct} \alias{na_sparse_union} \alias{na_dense_union} \alias{na_list} \alias{na_large_list} \alias{na_fixed_size_list} \alias{na_map} \alias{na_dictionary} \alias{na_extension} \title{Create type objects} \usage{ na_type( type_name, byte_width = NULL, unit = NULL, timezone = NULL, column_types = NULL, item_type = NULL, key_type = NULL, value_type = NULL, index_type = NULL, ordered = NULL, list_size = NULL, keys_sorted = NULL, storage_type = NULL, extension_name = NULL, extension_metadata = NULL, nullable = NULL ) na_na(nullable = TRUE) na_bool(nullable = TRUE) na_int8(nullable = TRUE) na_uint8(nullable = TRUE) na_int16(nullable = TRUE) na_uint16(nullable = TRUE) na_int32(nullable = TRUE) na_uint32(nullable = TRUE) na_int64(nullable = TRUE) na_uint64(nullable = TRUE) na_half_float(nullable = TRUE) na_float(nullable = TRUE) na_double(nullable = TRUE) na_string(nullable = TRUE) na_large_string(nullable = TRUE) na_binary(nullable = TRUE) na_large_binary(nullable = TRUE) na_fixed_size_binary(byte_width, nullable = TRUE) na_date32(nullable = TRUE) na_date64(nullable = TRUE) na_time32(unit = c("ms", "s"), nullable = TRUE) na_time64(unit = c("us", "ns"), nullable = TRUE) na_duration(unit = c("ms", "s", "us", "ns"), nullable = TRUE) na_interval_months(nullable = TRUE) na_interval_day_time(nullable = TRUE) na_interval_month_day_nano(nullable = TRUE) na_timestamp(unit = c("us", "ns", "s", "ms"), timezone = "", nullable = TRUE) na_decimal128(precision, scale, nullable = TRUE) na_decimal256(precision, scale, nullable = TRUE) na_struct(column_types = list(), nullable = FALSE) na_sparse_union(column_types = list()) na_dense_union(column_types = list()) na_list(item_type, nullable = TRUE) na_large_list(item_type, nullable = TRUE) 
na_fixed_size_list(item_type, list_size, nullable = TRUE) na_map(key_type, item_type, keys_sorted = FALSE, nullable = TRUE) na_dictionary(value_type, index_type = na_int32(), ordered = FALSE) na_extension(storage_type, extension_name, extension_metadata = "") } \arguments{ \item{type_name}{The name of the type (e.g., "int32"). This form of the constructor is useful for writing tests that loop over many types.} \item{byte_width}{For \code{\link[=na_fixed_size_binary]{na_fixed_size_binary()}}, the number of bytes occupied by each item.} \item{unit}{One of 's' (seconds), 'ms' (milliseconds), 'us' (microseconds), or 'ns' (nanoseconds).} \item{timezone}{A string representing a timezone name. The empty string "" represents a naive point in time (i.e., one that has no associated timezone).} \item{column_types}{A \code{list()} of \link[=as_nanoarrow_schema]{nanoarrow_schema}s.} \item{item_type}{For \code{\link[=na_list]{na_list()}}, \code{\link[=na_large_list]{na_large_list()}}, \code{\link[=na_fixed_size_list]{na_fixed_size_list()}}, and \code{\link[=na_map]{na_map()}}, the \link[=as_nanoarrow_schema]{nanoarrow_schema} representing the item type.} \item{key_type}{The \link[=as_nanoarrow_schema]{nanoarrow_schema} representing the \code{\link[=na_map]{na_map()}} key type.} \item{value_type}{The \link[=as_nanoarrow_schema]{nanoarrow_schema} representing the \code{\link[=na_dictionary]{na_dictionary()}} or \code{\link[=na_map]{na_map()}} value type.} \item{index_type}{The \link[=as_nanoarrow_schema]{nanoarrow_schema} representing the \code{\link[=na_dictionary]{na_dictionary()}} index type.} \item{ordered}{Use \code{TRUE} to assert that the order of values in the dictionary is meaningful.} \item{list_size}{The number of elements in each item in a \code{\link[=na_fixed_size_list]{na_fixed_size_list()}}.} \item{keys_sorted}{Use \code{TRUE} to assert that keys are sorted.} \item{storage_type}{For \code{\link[=na_extension]{na_extension()}}, the underlying value type.} 
\item{extension_name}{For \code{\link[=na_extension]{na_extension()}}, the extension name. This is typically namespaced separated by dots (e.g., arrow.r.vctrs).} \item{extension_metadata}{A string or raw vector defining extension metadata. Most Arrow extension types define extension metadata as a JSON object.} \item{nullable}{Use \code{FALSE} to assert that this field cannot contain null values.} \item{precision}{The total number of digits representable by the decimal type} \item{scale}{The number of digits after the decimal point in a decimal type} } \value{ A \link[=as_nanoarrow_schema]{nanoarrow_schema} } \description{ In nanoarrow, types, fields, and schemas are all represented by a \link[=as_nanoarrow_schema]{nanoarrow_schema}. These functions are convenience constructors to create these objects in a readable way. Use \code{\link[=na_type]{na_type()}} to construct types based on the constructor name, which is also the name that prints/is returned by \code{\link[=nanoarrow_schema_parse]{nanoarrow_schema_parse()}}. } \examples{ na_int32() na_struct(list(col1 = na_int32())) } nanoarrow/man/as_nanoarrow_array_stream.Rd0000644000176200001440000000262614315437207020600 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/array-stream.R \name{as_nanoarrow_array_stream} \alias{as_nanoarrow_array_stream} \title{Convert an object to a nanoarrow array_stream} \usage{ as_nanoarrow_array_stream(x, ..., schema = NULL) } \arguments{ \item{x}{An object to convert to an array_stream} \item{...}{Passed to S3 methods} \item{schema}{An optional schema used to enforce conversion to a particular type. Defaults to \code{\link[=infer_nanoarrow_schema]{infer_nanoarrow_schema()}}.} } \value{ An object of class 'nanoarrow_array_stream' } \description{ In nanoarrow, an 'array stream' corresponds to the \verb{struct ArrowArrayStream} as defined in the Arrow C Stream interface. 
This object is used to represent a stream of \link[=as_nanoarrow_array]{arrays} with a common \link[=as_nanoarrow_schema]{schema}. This is similar to an \link[arrow:RecordBatchReader]{arrow::RecordBatchReader} except it can be used to represent a stream of any type (not just record batches). Note that a stream of record batches and a stream of non-nullable struct arrays are represented identically. Also note that array streams are mutable objects and are passed by reference and not by value. } \examples{ (stream <- as_nanoarrow_array_stream(data.frame(x = 1:5))) stream$get_schema() stream$get_next() # The last batch is returned as NULL stream$get_next() # Release the stream stream$release() } nanoarrow/man/nanoarrow_version.Rd0000644000176200001440000000100314355103326017071 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/nanoarrow-package.R \name{nanoarrow_version} \alias{nanoarrow_version} \title{Underlying 'nanoarrow' C library build} \usage{ nanoarrow_version(runtime = TRUE) } \arguments{ \item{runtime}{Compare TRUE and FALSE values to detect a possible ABI mismatch.} } \value{ A string identifying the version of nanoarrow this package was compiled against. } \description{ Underlying 'nanoarrow' C library build } \examples{ nanoarrow_version() } nanoarrow/man/as_nanoarrow_schema.Rd0000644000176200001440000000251214547061553017345 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/schema.R \name{as_nanoarrow_schema} \alias{as_nanoarrow_schema} \alias{infer_nanoarrow_schema} \alias{nanoarrow_schema_parse} \alias{nanoarrow_schema_modify} \title{Convert an object to a nanoarrow schema} \usage{ as_nanoarrow_schema(x, ...) infer_nanoarrow_schema(x, ...) 
nanoarrow_schema_parse(x, recursive = FALSE) nanoarrow_schema_modify(x, new_values, validate = TRUE) } \arguments{ \item{x}{An object to convert to a schema} \item{...}{Passed to S3 methods} \item{recursive}{Use \code{TRUE} to include a \code{children} member when parsing schemas.} \item{new_values}{New schema component to assign} \item{validate}{Use \code{FALSE} to skip schema validation} } \value{ An object of class 'nanoarrow_schema' } \description{ In nanoarrow a 'schema' refers to a \verb{struct ArrowSchema} as defined in the Arrow C Data interface. This data structure can be used to represent an \code{\link[arrow:schema]{arrow::schema()}}, an \code{\link[arrow:Field]{arrow::field()}}, or an \code{arrow::DataType}. Note that in nanoarrow, an \code{\link[arrow:schema]{arrow::schema()}} and a non-nullable \code{\link[arrow:data-type]{arrow::struct()}} are represented identically. } \examples{ infer_nanoarrow_schema(integer()) infer_nanoarrow_schema(data.frame(x = integer())) } nanoarrow/man/nanoarrow_buffer_init.Rd0000644000176200001440000000250714502402506017706 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/buffer.R \name{nanoarrow_buffer_init} \alias{nanoarrow_buffer_init} \alias{nanoarrow_buffer_append} \alias{convert_buffer} \title{Create and modify nanoarrow buffers} \usage{ nanoarrow_buffer_init() nanoarrow_buffer_append(buffer, new_buffer) convert_buffer(buffer, to = NULL) } \arguments{ \item{buffer, new_buffer}{\link[=as_nanoarrow_buffer]{nanoarrow_buffer}s.} \item{to}{A target prototype object describing the type to which \code{array} should be converted, or \code{NULL} to use the default conversion as returned by \code{\link[=infer_nanoarrow_ptype]{infer_nanoarrow_ptype()}}. 
Alternatively, a function can be passed to perform an alternative calculation of the default ptype as a function of \code{array} and the default inference of the prototype.} } \value{ \itemize{ \item \code{nanoarrow_buffer_init()}: An object of class 'nanoarrow_buffer' \item \code{nanoarrow_buffer_append()}: Returns \code{buffer}, invisibly. Note that \code{buffer} is modified in place by reference. } } \description{ Create and modify nanoarrow buffers } \examples{ buffer <- nanoarrow_buffer_init() nanoarrow_buffer_append(buffer, 1:5) array <- nanoarrow_array_modify( nanoarrow_array_init(na_int32()), list(length = 5, buffers = list(NULL, buffer)) ) as.vector(array) } nanoarrow/man/nanoarrow_extension_spec.Rd0000644000176200001440000000272114502402562020440 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/extension.R \name{nanoarrow_extension_spec} \alias{nanoarrow_extension_spec} \alias{register_nanoarrow_extension} \alias{unregister_nanoarrow_extension} \alias{resolve_nanoarrow_extension} \title{Register Arrow extension types} \usage{ nanoarrow_extension_spec(data = list(), subclass = character()) register_nanoarrow_extension(extension_name, extension_spec) unregister_nanoarrow_extension(extension_name) resolve_nanoarrow_extension(extension_name) } \arguments{ \item{data}{Optional data to include in the extension type specification} \item{subclass}{A subclass for the extension type specification. Extension methods will dispatch on this object.} \item{extension_name}{An Arrow extension type name (e.g., arrow.r.vctrs)} \item{extension_spec}{An extension specification inheriting from 'nanoarrow_extension_spec'.} } \value{ \itemize{ \item \code{nanoarrow_extension_spec()} returns an object of class 'nanoarrow_extension_spec'. \item \code{register_nanoarrow_extension()} returns \code{extension_spec}, invisibly. \item \code{unregister_nanoarrow_extension()} returns \code{extension_name}, invisibly. 
\item \code{resolve_nanoarrow_extension()} returns an object of class 'nanoarrow_extension_spec' or NULL if the extension type was not registered. } } \description{ Register Arrow extension types } \examples{ nanoarrow_extension_spec("mynamespace.mytype", subclass = "mypackage_mytype_spec") } nanoarrow/man/infer_nanoarrow_ptype.Rd0000644000176200001440000000333514355103326017742 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/infer-ptype.R \name{infer_nanoarrow_ptype} \alias{infer_nanoarrow_ptype} \title{Infer an R vector prototype} \usage{ infer_nanoarrow_ptype(x) } \arguments{ \item{x}{A \link[=as_nanoarrow_schema]{nanoarrow_schema}, \link[=as_nanoarrow_array]{nanoarrow_array}, or \link[=as_nanoarrow_array_stream]{nanoarrow_array_stream}.} } \value{ An R vector of zero size describing the target into which the array should be materialized. } \description{ Resolves the default \code{to} value to use in \code{\link[=convert_array]{convert_array()}} and \code{\link[=convert_array_stream]{convert_array_stream()}}. 
The default conversions are: } \details{ \itemize{ \item null to \code{\link[vctrs:unspecified]{vctrs::unspecified()}} \item boolean to \code{\link[=logical]{logical()}} \item int8, uint8, int16, uint16, and int32 to \code{\link[=integer]{integer()}} \item uint32, int64, uint64, float, and double to \code{\link[=double]{double()}} \item string and large string to \code{\link[=character]{character()}} \item struct to \code{\link[=data.frame]{data.frame()}} \item binary and large binary to \code{\link[blob:blob]{blob::blob()}} \item list, large_list, and fixed_size_list to \code{\link[vctrs:list_of]{vctrs::list_of()}} \item time32 and time64 to \code{\link[hms:hms]{hms::hms()}} \item duration to \code{\link[=difftime]{difftime()}} \item date32 to \code{\link[=as.Date]{as.Date()}} \item timestamp to \code{\link[=as.POSIXct]{as.POSIXct()}} } Additional conversions are possible by specifying an explicit value for \code{to}. For details of each conversion, see \code{\link[=convert_array]{convert_array()}}. } \examples{ infer_nanoarrow_ptype(as_nanoarrow_array(1:10)) } nanoarrow/man/nanoarrow_array_init.Rd0000644000176200001440000000425014377444470017570 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/array.R \name{nanoarrow_array_init} \alias{nanoarrow_array_init} \alias{nanoarrow_array_set_schema} \alias{nanoarrow_array_modify} \title{Modify nanoarrow arrays} \usage{ nanoarrow_array_init(schema) nanoarrow_array_set_schema(array, schema, validate = TRUE) nanoarrow_array_modify(array, new_values, validate = TRUE) } \arguments{ \item{schema}{A \link[=as_nanoarrow_schema]{nanoarrow_schema} to attach to this \code{array}.} \item{array}{A \link[=as_nanoarrow_array]{nanoarrow_array}.} \item{validate}{Use \code{FALSE} to skip validation.
Skipping validation may result in creating an array that will crash R.} \item{new_values}{A named \code{list()} of values to replace.} } \value{ \itemize{ \item \code{nanoarrow_array_init()} returns a possibly invalid but initialized array with a given \code{schema}. \item \code{nanoarrow_array_set_schema()} returns \code{array}, invisibly. Note that \code{array} is modified in place by reference. \item \code{nanoarrow_array_modify()} returns a shallow copy of \code{array} with the modified parameters such that the original array remains valid. } } \description{ Create a new array or from an existing array, modify one or more parameters. When importing an array from elsewhere, \code{nanoarrow_array_set_schema()} is useful to attach the data type information to the array (without this information there is little that nanoarrow can do with the array since its content cannot be otherwise interpreted). \code{nanoarrow_array_modify()} can create a shallow copy and modify various parameters to create a new array, including setting children and buffers recursively. These functions power the \verb{$<-} operator, which can modify one parameter at a time. 
} \examples{ nanoarrow_array_init(na_string()) # Modify an array using $ and <- array <- as_nanoarrow_array(1:5) array$length <- 4 as.vector(array) # Modify potentially more than one component at a time array <- as_nanoarrow_array(1:5) as.vector(nanoarrow_array_modify(array, list(length = 4))) # Attach a schema to an array array <- as_nanoarrow_array(-1L) nanoarrow_array_set_schema(array, na_uint32()) as.vector(array) } nanoarrow/man/as_nanoarrow_array.Rd0000644000176200001440000000210314502402562017205 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/array.R \name{as_nanoarrow_array} \alias{as_nanoarrow_array} \title{Convert an object to a nanoarrow array} \usage{ as_nanoarrow_array(x, ..., schema = NULL) } \arguments{ \item{x}{An object to convert to a array} \item{...}{Passed to S3 methods} \item{schema}{An optional schema used to enforce conversion to a particular type. Defaults to \code{\link[=infer_nanoarrow_schema]{infer_nanoarrow_schema()}}.} } \value{ An object of class 'nanoarrow_array' } \description{ In nanoarrow an 'array' refers to the \verb{struct ArrowArray} definition in the Arrow C data interface. At the R level, we attach a \link[=as_nanoarrow_schema]{schema} such that functionally the nanoarrow_array class can be used in a similar way as an \code{arrow::Array}. Note that in nanoarrow an \code{arrow::RecordBatch} and a non-nullable \code{arrow::StructArray} are represented identically. 
} \examples{ (array <- as_nanoarrow_array(1:5)) as.vector(array) (array <- as_nanoarrow_array(data.frame(x = 1:5))) as.data.frame(array) } nanoarrow/man/array_stream_set_finalizer.Rd0000644000176200001440000000246114547575511020752 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/array-stream.R \name{array_stream_set_finalizer} \alias{array_stream_set_finalizer} \title{Register an array stream finalizer} \usage{ array_stream_set_finalizer(array_stream, finalizer) } \arguments{ \item{array_stream}{A \link[=as_nanoarrow_array_stream]{nanoarrow_array_stream}} \item{finalizer}{A function that will be called with zero arguments.} } \value{ A newly allocated \code{array_stream} whose release callback will call the supplied finalizer. } \description{ In some cases, R functions that return a \link[=as_nanoarrow_array_stream]{nanoarrow_array_stream} may require that the scope of some other object outlive that of the array stream. If there is a need for that object to be released deterministically (e.g., to close open files), you can register a function to run after the stream's release callback is invoked from the R thread. Note that this finalizer will \strong{not} be run if the stream's release callback is invoked from a \strong{non}-R thread. In this case, the finalizer and its chain of environments will be garbage-collected when \code{nanoarrow:::preserved_empty()} is run. } \examples{ stream <- array_stream_set_finalizer( basic_array_stream(list(1:5)), function() message("All done!") ) stream$release() } nanoarrow/man/na_vctrs.Rd0000644000176200001440000000233114502402562015140 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/extension-vctrs.R \name{na_vctrs} \alias{na_vctrs} \title{Vctrs extension type} \usage{ na_vctrs(ptype, storage_type = NULL) } \arguments{ \item{ptype}{A vctrs prototype as returned by \code{\link[vctrs:vec_ptype]{vctrs::vec_ptype()}}. 
The prototype can be of arbitrary size, but a zero-size vector is sufficient here.} \item{storage_type}{For \code{\link[=na_extension]{na_extension()}}, the underlying value type.} } \value{ A \link[=as_nanoarrow_schema]{nanoarrow_schema}. } \description{ The Arrow format provides a rich type system that can handle most R vector types; however, many R vector types do not roundtrip perfectly through Arrow memory. The vctrs extension type uses \code{\link[vctrs:vec_data]{vctrs::vec_data()}}, \code{\link[vctrs:vec_proxy]{vctrs::vec_restore()}}, and \code{\link[vctrs:vec_ptype]{vctrs::vec_ptype()}} in calls to \code{\link[=as_nanoarrow_array]{as_nanoarrow_array()}} and \code{\link[=convert_array]{convert_array()}} to ensure roundtrip fidelity. } \examples{ vctr <- as.POSIXlt("2000-01-02 03:45", tz = "UTC") array <- as_nanoarrow_array(vctr, schema = na_vctrs(vctr)) infer_nanoarrow_ptype(array) convert_array(array) } nanoarrow/man/nanoarrow-package.Rd0000644000176200001440000000222214440676475016740 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/nanoarrow-package.R \docType{package} \name{nanoarrow-package} \alias{nanoarrow} \alias{nanoarrow-package} \title{nanoarrow: Interface to the 'nanoarrow' 'C' Library} \description{ Provides an 'R' interface to the 'nanoarrow' 'C' library and the 'Apache Arrow' application binary interface. Functions to import and export 'ArrowArray', 'ArrowSchema', and 'ArrowArrayStream' 'C' structures to and from 'R' objects are provided alongside helpers to facilitate zero-copy data transfer among 'R' bindings to libraries implementing the 'Arrow' 'C' data interface. 
} \seealso{ Useful links: \itemize{ \item \url{https://github.com/apache/arrow-nanoarrow} \item Report bugs at \url{https://github.com/apache/arrow-nanoarrow/issues} } } \author{ \strong{Maintainer}: Dewey Dunnington \email{dewey@dunnington.ca} (\href{https://orcid.org/0000-0002-9415-4582}{ORCID}) Authors: \itemize{ \item Apache Arrow \email{dev@arrow.apache.org} [copyright holder] } Other contributors: \itemize{ \item Apache Software Foundation \email{dev@arrow.apache.org} [copyright holder] } } \keyword{internal} nanoarrow/man/nanoarrow_pointer_is_valid.Rd0000644000176200001440000001125614502402506020745 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/pointers.R \name{nanoarrow_pointer_is_valid} \alias{nanoarrow_pointer_is_valid} \alias{nanoarrow_pointer_addr_dbl} \alias{nanoarrow_pointer_addr_chr} \alias{nanoarrow_pointer_addr_pretty} \alias{nanoarrow_pointer_release} \alias{nanoarrow_pointer_move} \alias{nanoarrow_pointer_export} \alias{nanoarrow_allocate_schema} \alias{nanoarrow_allocate_array} \alias{nanoarrow_allocate_array_stream} \alias{nanoarrow_pointer_set_protected} \title{Danger zone: low-level pointer operations} \usage{ nanoarrow_pointer_is_valid(ptr) nanoarrow_pointer_addr_dbl(ptr) nanoarrow_pointer_addr_chr(ptr) nanoarrow_pointer_addr_pretty(ptr) nanoarrow_pointer_release(ptr) nanoarrow_pointer_move(ptr_src, ptr_dst) nanoarrow_pointer_export(ptr_src, ptr_dst) nanoarrow_allocate_schema() nanoarrow_allocate_array() nanoarrow_allocate_array_stream() nanoarrow_pointer_set_protected(ptr_src, protected) } \arguments{ \item{ptr, ptr_src, ptr_dst}{An external pointer to a \verb{struct ArrowSchema}, \verb{struct ArrowArray}, or \verb{struct ArrowArrayStream}.} \item{protected}{An object whose scope must outlive that of \code{ptr}. 
This is useful for array streams since at least two specifications involving the array stream specify that the stream is only valid for the lifecycle of another object (e.g., an AdbcStatement or OGRDataset).} } \value{ \itemize{ \item \code{nanoarrow_pointer_is_valid()} returns TRUE if the pointer is non-null and has a non-null release callback. \item \code{nanoarrow_pointer_addr_dbl()} and \code{nanoarrow_pointer_addr_chr()} return pointer representations that may be helpful to facilitate moving or exporting nanoarrow objects to other libraries. \item \code{nanoarrow_pointer_addr_pretty()} gives a pointer representation suitable for printing or error messages. \item \code{nanoarrow_pointer_release()} returns \code{ptr}, invisibly. \item \code{nanoarrow_pointer_move()} and \code{nanoarrow_pointer_export()} return \code{ptr_dst}, invisibly. \item \code{nanoarrow_allocate_array()}, \code{nanoarrow_allocate_schema()}, and \code{nanoarrow_allocate_array_stream()} return an \link[=as_nanoarrow_array]{array}, a \link[=as_nanoarrow_schema]{schema}, and an \link[=as_nanoarrow_array_stream]{array stream}, respectively. } } \description{ The \link[=as_nanoarrow_schema]{nanoarrow_schema}, \link[=as_nanoarrow_array]{nanoarrow_array}, and \link[=as_nanoarrow_array_stream]{nanoarrow_array_stream} classes are represented in R as external pointers (\code{EXTPTRSXP}). When these objects go out of scope (i.e., when they are garbage collected or shortly thereafter), the underlying object's \code{release()} callback is called if the underlying pointer is non-null and if the \code{release()} callback is non-null. } \details{ When interacting with other C Data Interface implementations, it is important to keep in mind that the R object wrapping these pointers is always passed by reference (because it is an external pointer) and may be referred to by another R object (e.g., an element in a \code{list()} or as a variable assigned in a user's environment).
When importing a schema, array, or array stream into nanoarrow this is not a problem: the R object takes ownership of the lifecycle and memory is released when the R object is garbage collected. In this case, one can use \code{\link[=nanoarrow_pointer_move]{nanoarrow_pointer_move()}} where \code{ptr_dst} was created using \verb{nanoarrow_allocate_*()}. The case of exporting is more complicated and as such has a dedicated function, \code{\link[=nanoarrow_pointer_export]{nanoarrow_pointer_export()}}, that implements different logic for schemas, arrays, and array streams: \itemize{ \item Schema objects are (deep) copied such that a fresh copy of the schema is exported and made the responsibility of some other C data interface implementation. \item Array objects are exported as a shell around the original array that preserves a reference to the R object. This ensures that the buffers and children pointed to by the array are not copied and that any references to the original array are not invalidated. \item Array stream objects are moved: the responsibility for the object is transferred to the other C data interface implementation and any references to the original R object are invalidated. Because these objects are mutable, this is typically what you want (i.e., you should not be pulling arrays from a stream accidentally from two places). } If you know the lifecycle of your object (i.e., you created the R object yourself and never passed references to it elsewhere), you can slightly more efficiently call \code{\link[=nanoarrow_pointer_move]{nanoarrow_pointer_move()}} for all three pointer types.
} nanoarrow/man/convert_array_stream.Rd0000644000176200001440000000423114502402562017553 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/convert-array-stream.R \name{convert_array_stream} \alias{convert_array_stream} \alias{collect_array_stream} \title{Convert an Array Stream into an R vector} \usage{ convert_array_stream(array_stream, to = NULL, size = NULL, n = Inf) collect_array_stream(array_stream, n = Inf, schema = NULL, validate = TRUE) } \arguments{ \item{array_stream}{A \link[=as_nanoarrow_array_stream]{nanoarrow_array_stream}.} \item{to}{A target prototype object describing the type to which \code{array} should be converted, or \code{NULL} to use the default conversion as returned by \code{\link[=infer_nanoarrow_ptype]{infer_nanoarrow_ptype()}}. Alternatively, a function can be passed to perform an alternative calculation of the default ptype as a function of \code{array} and the default inference of the prototype.} \item{size}{The exact size of the output, if known. If specified, slightly more efficient implementation may be used to collect the output.} \item{n}{The maximum number of batches to pull from the array stream.} \item{schema}{A \link[=as_nanoarrow_schema]{nanoarrow_schema} or \code{NULL} to guess based on the first schema.} \item{validate}{Use \code{FALSE} to skip the validation step (i.e., if you know that the arrays are valid).} } \value{ \itemize{ \item \code{convert_array_stream()}: An R vector of type \code{to}. \item \code{collect_array_stream()}: A \code{list()} of \link[=as_nanoarrow_array]{nanoarrow_array} } } \description{ Converts \code{array_stream} to the type specified by \code{to}. This is a low-level interface; most users should use \code{as.data.frame()} or \code{as.vector()} unless finer-grained control is needed over the conversion. 
See \code{\link[=convert_array]{convert_array()}} for details of the conversion process; see \code{\link[=infer_nanoarrow_ptype]{infer_nanoarrow_ptype()}} for default inferences of \code{to}. } \examples{ stream <- as_nanoarrow_array_stream(data.frame(x = 1:5)) str(convert_array_stream(stream)) str(convert_array_stream(stream, to = data.frame(x = double()))) stream <- as_nanoarrow_array_stream(data.frame(x = 1:5)) collect_array_stream(stream) } nanoarrow/DESCRIPTION0000644000176200001440000000276714557000642014005 0ustar liggesusersPackage: nanoarrow Title: Interface to the 'nanoarrow' 'C' Library Version: 0.4.0 Authors@R: c( person(given = "Dewey", family = "Dunnington", role = c("aut", "cre"), email = "dewey@dunnington.ca", comment = c(ORCID = "0000-0002-9415-4582")), person("Apache Arrow", email = "dev@arrow.apache.org", role = c("aut", "cph")), person("Apache Software Foundation", email = "dev@arrow.apache.org", role = c("cph")) ) Description: Provides an 'R' interface to the 'nanoarrow' 'C' library and the 'Apache Arrow' application binary interface. Functions to import and export 'ArrowArray', 'ArrowSchema', and 'ArrowArrayStream' 'C' structures to and from 'R' objects are provided alongside helpers to facilitate zero-copy data transfer among 'R' bindings to libraries implementing the 'Arrow' 'C' data interface. 
License: Apache License (>= 2) Encoding: UTF-8 RoxygenNote: 7.2.3 URL: https://github.com/apache/arrow-nanoarrow BugReports: https://github.com/apache/arrow-nanoarrow/issues Suggests: arrow (>= 9.0.0), bit64, blob, hms, rlang, testthat (>= 3.0.0), tibble, vctrs, withr Config/testthat/edition: 3 Config/build/bootstrap: TRUE NeedsCompilation: yes Packaged: 2024-02-01 20:13:43 UTC; deweydunnington Author: Dewey Dunnington [aut, cre] (<https://orcid.org/0000-0002-9415-4582>), Apache Arrow [aut, cph], Apache Software Foundation [cph] Maintainer: Dewey Dunnington <dewey@dunnington.ca> Repository: CRAN Date/Publication: 2024-02-01 20:40:02 UTC nanoarrow/tests/0000755000176200001440000000000014307221533013422 5ustar liggesusersnanoarrow/tests/testthat/0000755000176200001440000000000014557000642015265 5ustar liggesusersnanoarrow/tests/testthat/test-extension-vctrs.R0000644000176200001440000000631414502402562021541 0ustar liggesusers# Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. test_that("vctrs extension type can roundtrip built-in vector types", { skip_if_not_installed("tibble") # Arrow tibbleifies everything, so we do here too # Lists aren't automatically handled in nanoarrow conversion, so they # aren't listed here yet.
vectors <- list(
    lgl = c(FALSE, TRUE, NA),
    int = c(0L, 1L, NA_integer_),
    dbl = c(0, 1, NA_real_),
    chr = c("a", NA_character_),
    posixct = as.POSIXct("2000-01-01 12:23", tz = "UTC"),
    posixlt = as.POSIXlt("2000-01-01 12:23", tz = "UTC"),
    date = as.Date("2000-01-01"),
    difftime = as.difftime(123, units = "secs"),
    data_frame_simple = tibble::tibble(x = 1:5),
    data_frame_nested = tibble::tibble(x = 1:5, y = tibble::tibble(z = letters[1:5]))
  )

  # Each vector is wrapped in the vctrs extension type and must come back
  # identical through every conversion path exercised below.
  for (nm in names(vectors)) {
    vctr <- vectors[[nm]]
    ptype <- vctrs::vec_ptype(vctr)
    schema <- na_vctrs(vctr)

    array <- as_nanoarrow_array(vctr, schema = schema)
    array_schema <- infer_nanoarrow_schema(array)

    # Roundtrip through convert_array()
    expect_true(nanoarrow_schema_identical(array_schema, schema))
    expect_identical(infer_nanoarrow_ptype(array), ptype)
    expect_identical(convert_array(array), vctr)

    # Roundtrip with an empty array stream
    stream <- basic_array_stream(list(), schema = schema)
    expect_identical(convert_array_stream(stream), ptype)

    # Roundtrip with multiple chunks
    stream <- basic_array_stream(list(array, array))
    expect_identical(convert_array_stream(stream), vctrs::vec_rep(vctr, 2))

    # arrow is an optional dependency: only exercise interop when available
    if (requireNamespace("arrow", quietly = TRUE)) {
      # Roundtrip from nanoarrow -> arrow -> R
      arrow_array <- arrow::as_arrow_array(array)
      expect_s3_class(arrow_array, "ExtensionArray")
      expect_identical(arrow_array$type$ptype(), ptype)
      expect_identical(arrow_array$as_vector(), vctr)

      # Roundtrip from arrow -> nanoarrow -> R
      # Convert the arrow extension array itself (not the original R vector);
      # otherwise this branch only repeats the R -> nanoarrow check above and
      # never exercises the arrow -> nanoarrow import path.
      arrow_array <- arrow::vctrs_extension_array(vctr)
      array <- as_nanoarrow_array(arrow_array)
      expect_identical(infer_nanoarrow_ptype(array), ptype)
      expect_identical(convert_array(array), vctr)
    }
  }
})

# An explicit `to` must take precedence over the ptype stored in the
# extension metadata.
test_that("vctrs extension type respects `to` in convert_array()", {
  skip_if_not_installed("vctrs")

  vctr <- as.Date("2000-01-01")
  array <- as_nanoarrow_array(vctr, schema = na_vctrs(vctr))

  expect_identical(convert_array(array), vctr)
  expect_identical(
    convert_array(array, to = as.POSIXct(character())),
    vctrs::vec_cast(vctr,
as.POSIXct(character()))
  )
})
nanoarrow/tests/testthat/test-schema.R0000644000176200001440000004555014555060376017646 0ustar liggesusers# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# NOTE(review): the expected format() strings in the next two tests were empty
# ("") in this copy, which no schema can format to; the angle-bracketed text
# appears to have been stripped. The values below are restored from the
# package's documented output -- confirm against format.nanoarrow_schema().
test_that("nanoarrow_schema format, print, and str methods work", {
  schema <- na_int32()
  expect_identical(format(schema), "<nanoarrow_schema int32>")
  # str() and print() return their input and mention the class in the output
  expect_output(expect_identical(str(schema), schema), "nanoarrow_schema")
  expect_output(expect_identical(print(schema), schema), "nanoarrow_schema")
})

test_that("nanoarrow_schema format, print, and str methods work for invalid pointers", {
  # A freshly allocated schema has not been initialized, so it formats as invalid
  schema <- nanoarrow_allocate_schema()
  expect_identical(format(schema), "<nanoarrow_schema [invalid: schema is released]>")
  expect_output(expect_identical(str(schema), schema), "nanoarrow_schema")
  expect_output(expect_identical(print(schema), schema), "nanoarrow_schema")
})

test_that("as_nanoarrow_schema() works for nanoarrow_schema", {
  schema <- na_int32()
  expect_identical(as_nanoarrow_schema(schema), schema)
})

test_that("infer_nanoarrow_schema() errors for unsupported types", {
  expect_error(
    infer_nanoarrow_schema(environment()),
    "Can't infer Arrow type"
  )
})

# Each expectation pins the Arrow C data interface format string that is
# inferred for a built-in R vector type.
test_that("infer_nanoarrow_schema() methods work for built-in types", {
  expect_identical(infer_nanoarrow_schema(raw())$format, "C")
  expect_identical(infer_nanoarrow_schema(logical())$format, "b")
expect_identical(infer_nanoarrow_schema(integer())$format, "i")
  expect_identical(infer_nanoarrow_schema(double())$format, "g")
  expect_identical(infer_nanoarrow_schema(character())$format, "u")
  expect_identical(infer_nanoarrow_schema(Sys.Date())$format, "tdD")

  # Factors are dictionary-encoded: integer indices with a string dictionary
  expect_identical(infer_nanoarrow_schema(factor())$format, "i")
  expect_identical(infer_nanoarrow_schema(factor())$dictionary$format, "u")

  time <- as.POSIXct("2000-01-01", tz = "UTC")
  expect_identical(infer_nanoarrow_schema(time)$format, "tsu:UTC")

  # Some systems (mostly Docker images) don't return a value for Sys.timezone()
  # so set one explicitly to test the Sys.timezone() fallback.
  withr::with_timezone("America/Halifax", {
    time <- as.POSIXct("2000-01-01", tz = "")
    expect_identical(
      infer_nanoarrow_schema(time)$format,
      paste0("tsu:", Sys.timezone())
    )
  })

  # Spell out `units` in full rather than relying on partial argument
  # matching of `unit`.
  difftime <- as.difftime(double(), units = "secs")
  expect_identical(infer_nanoarrow_schema(difftime)$format, "tDu")

  # Data frames map to a struct whose children are named after the columns
  df_schema <- infer_nanoarrow_schema(data.frame(x = 1L))
  expect_identical(df_schema$format, "+s")
  expect_identical(df_schema$children$x$format, "i")
})

test_that("infer_nanoarrow_schema() methods work for blob type", {
  skip_if_not_installed("blob")

  expect_identical(infer_nanoarrow_schema(blob::blob())$format, "z")
})

test_that("infer_nanoarrow_schema() methods work for hms type", {
  skip_if_not_installed("hms")

  expect_identical(infer_nanoarrow_schema(hms::hms())$format, "ttm")
})

test_that("infer_nanoarrow_schema() methods work for vctrs types", {
  skip_if_not_installed("vctrs")

  expect_identical(infer_nanoarrow_schema(vctrs::unspecified())$format, "n")

  # A list_of() becomes a list whose single child carries the ptype's format
  list_schema <- infer_nanoarrow_schema(vctrs::list_of(.ptype = integer()))
  expect_identical(list_schema$format, "+l")
  expect_identical(list_schema$children[[1]]$format, "i")
})

test_that("infer_nanoarrow_schema() method works for integer64()", {
  skip_if_not_installed("bit64")
  expect_identical(infer_nanoarrow_schema(bit64::integer64())$format, "l")
})

test_that("infer_nanoarrow_schema() method works for
AsIs", {
  # I() must not change the inferred type
  expect_identical(
    infer_nanoarrow_schema(I(integer()))$format,
    infer_nanoarrow_schema(integer())$format
  )
})

test_that("infer_nanoarrow_schema() returns list of null for empty or all null list", {
  # Empty list
  expect_identical(infer_nanoarrow_schema(list())$format, "+l")
  expect_identical(infer_nanoarrow_schema(list())$children[[1]]$format, "n")

  # All-NULL list: check the child type of list(NULL) itself. The previous
  # version re-checked list() here, leaving the all-NULL child type untested.
  expect_identical(infer_nanoarrow_schema(list(NULL))$format, "+l")
  expect_identical(infer_nanoarrow_schema(list(NULL))$children[[1]]$format, "n")
})

test_that("infer_nanoarrow_schema() returns binary for list of raw", {
  expect_identical(infer_nanoarrow_schema(list(raw()))$format, "z")
  expect_identical(infer_nanoarrow_schema(list(raw(), NULL))$format, "z")
})

# nanoarrow_schema_parse() exposes type-specific parameters (fixed size,
# decimal precision/scale, time unit, timezone) as named list elements.
test_that("nanoarrow_schema_parse() works", {
  simple_info <- nanoarrow_schema_parse(na_int32())
  expect_identical(simple_info$type, "int32")
  expect_identical(simple_info$storage_type, "int32")

  fixed_size_info <- nanoarrow_schema_parse(na_fixed_size_binary(1234))
  expect_identical(fixed_size_info$fixed_size, 1234L)

  decimal_info <- nanoarrow_schema_parse(na_decimal128(4, 5))
  expect_identical(decimal_info$decimal_bitwidth, 128L)
  expect_identical(decimal_info$decimal_precision, 4L)
  expect_identical(decimal_info$decimal_scale, 5L)

  time_unit_info <- nanoarrow_schema_parse(na_time32("s"))
  expect_identical(time_unit_info$time_unit, "s")

  timezone_info <- nanoarrow_schema_parse(na_timestamp("s", "America/Halifax"))
  expect_identical(timezone_info$timezone, "America/Halifax")

  # `children` is only populated when recursive = TRUE
  recursive_info <- nanoarrow_schema_parse(
    na_struct(list(x = na_int32())),
    recursive = FALSE
  )
  expect_null(recursive_info$children)

  recursive_info <- nanoarrow_schema_parse(
    na_struct(list(x = na_int32())),
    recursive = TRUE
  )
  expect_length(recursive_info$children, 1L)
  expect_identical(
    recursive_info$children$x,
    nanoarrow_schema_parse(na_int32())
  )
})

test_that("nanoarrow_schema_parse() works for extension types", {
  ext_info <- nanoarrow_schema_parse(na_extension(na_int32(), "ext_name", "ext_meta"))
expect_identical(ext_info$type, "int32")
  expect_identical(ext_info$storage_type, "int32")
  expect_identical(ext_info$extension_name, "ext_name")
  # Extension metadata is reported as a raw vector, not a string
  expect_identical(ext_info$extension_metadata, charToRaw("ext_meta"))
})

# A schema can be inspected like a six-element named list; for a plain int32
# the children list is empty and the dictionary is NULL.
test_that("schema list interface works for non-nested types", {
  schema <- na_int32()
  expect_identical(length(schema), 6L)
  expect_identical(
    names(schema),
    c("format", "name", "metadata", "flags", "children", "dictionary")
  )
  expect_identical(schema$format, "i")
  expect_identical(schema$name, "")
  expect_identical(schema$metadata, list())
  expect_identical(schema$flags, 2L)
  expect_identical(schema$children, list())
  expect_identical(schema$dictionary, NULL)
})

# Children of a struct are themselves nanoarrow_schema objects, reachable by
# name or by position; the recursive proxy returns them as plain lists instead.
test_that("schema list interface works for nested types", {
  schema <- na_struct(list(a = na_int32(), b = na_string()))

  expect_identical(schema$format, "+s")
  expect_named(schema$children, c("a", "b"))
  expect_identical(schema$children$a, schema$children[[1]])
  expect_identical(schema$children$a$format, "i")
  expect_identical(schema$children$b$format, "u")
  expect_s3_class(schema$children$a, "nanoarrow_schema")
  expect_s3_class(schema$children$b, "nanoarrow_schema")

  info_recursive <- nanoarrow_schema_proxy(schema, recursive = TRUE)
  expect_type(info_recursive$children$a, "list")
  expect_identical(info_recursive$children$a$format, "i")
})

# The dictionary member is a nanoarrow_schema with the string format ("u");
# the recursive proxy returns it as a plain list.
test_that("schema list interface works for dictionary types", {
  schema <- na_dictionary(na_string(), na_int8())

  expect_identical(schema$format, "c")
  expect_identical(schema$dictionary$format, "u")
  expect_s3_class(schema$dictionary, "nanoarrow_schema")

  info_recursive <- nanoarrow_schema_proxy(schema, recursive = TRUE)
  expect_type(info_recursive$dictionary, "list")
  expect_identical(info_recursive$dictionary$format, "u")
})

# Extension name and metadata are stored under the ARROW:extension:* keys of
# the schema metadata.
test_that("schema list interface works with metadata", {
  schema <- na_extension(na_int32(), "ext_name", "ext_meta")
  expect_identical(
    schema$metadata[["ARROW:extension:name"]],
    "ext_name"
  )
  expect_identical(
    schema$metadata[["ARROW:extension:metadata"]],
    "ext_meta"
  )
})

# `new_values` must be a named list and may only name known schema components.
test_that("schema modify errors for invalid components", {
  schema <- na_int32()

  expect_error(
    nanoarrow_schema_modify(schema, list(1, 2, 3)),
    "`new_values`"
  )

  expect_error(
    nanoarrow_schema_modify(schema, list(not_an_item = NULL)),
    "Can't modify schema"
  )
})

# With nothing to change, the result has the same pointer address as the input.
test_that("schema modify does not copy if length(new_values) == 0", {
  schema <- na_int32()
  expect_identical(
    nanoarrow_pointer_addr_chr(nanoarrow_schema_modify(schema, list())),
    nanoarrow_pointer_addr_chr(schema)
  )
})

# Modifying one component must leave the other components untouched.
test_that("schema modify can modify format", {
  schema <- na_int32()

  schema2 <- nanoarrow_schema_modify(schema, list(format = "I"))
  expect_identical(schema2$format, "I")
  expect_identical(schema2$name, schema$name)
  expect_identical(schema2$flags, schema$flags)

  # format is mandatory: neither NULL nor a zero-length vector is accepted
  expect_error(
    nanoarrow_schema_modify(schema, list(format = NULL)),
    "schema\\$format must be character"
  )

  expect_error(
    nanoarrow_schema_modify(schema, list(format = character())),
    "schema\\$format must be character"
  )
})

test_that("schema modify can modify name", {
  schema <- na_int32()

  schema2 <- nanoarrow_schema_modify(schema, list(name = "new_name"))
  expect_identical(schema2$name, "new_name")
  expect_identical(schema2$format, schema$format)
  expect_identical(schema2$flags, schema$flags)

  # Unlike format, name may be set to NULL
  schema2 <- nanoarrow_schema_modify(schema, list(name = NULL))
  expect_null(schema2$name)
  expect_identical(schema2$format, schema$format)
  expect_identical(schema2$flags, schema$flags)

  expect_error(
    nanoarrow_schema_modify(schema, list(name = character())),
    "schema\\$name must be NULL or character"
  )
})

test_that("schema modify can modify flags", {
  schema <- na_int32()
  expect_identical(schema$flags, 2L)

  # A double 0 is accepted and read back as integer 0L
  schema2 <- nanoarrow_schema_modify(schema, list(flags = 0))
  expect_identical(schema2$flags, 0L)
  expect_identical(schema2$format, schema$format)
  expect_identical(schema2$name, schema$name)

  expect_error(
    nanoarrow_schema_modify(schema, list(flags = integer())),
    "schema\\$flags must be integer"
  )
})

test_that("schema modify can modify metadata", {
schema <- na_int32() schema2 <- nanoarrow_schema_modify(schema, list(metadata = list())) expect_identical(schema2$metadata, list()) expect_identical(schema2$format, schema$format) schema3 <- nanoarrow_schema_modify(schema, list(metadata = NULL)) expect_identical(schema3$metadata, list()) expect_identical(schema3$format, schema$format) schema4 <- nanoarrow_schema_modify(schema, list(metadata = list(key = "value"))) expect_identical(schema4$metadata, list(key = "value")) expect_identical(schema4$format, schema$format) schema5 <- nanoarrow_schema_modify( schema, list(metadata = list(new_key = charToRaw("new value"))) ) expect_identical(schema5$metadata, list(new_key = "new value")) expect_identical(schema5$format, schema$format) expect_error( nanoarrow_schema_modify(schema, list(metadata = list(1))), "schema\\$metadata must be named" ) expect_error( nanoarrow_schema_modify(schema, list(metadata = setNames(list(1, 2), c("", "")))), "must be named" ) expect_error( nanoarrow_schema_modify(schema, list(metadata = setNames(list(1), NA_character_))), "must be named" ) expect_error( nanoarrow_schema_modify(schema, list(metadata = list(name = NULL))), "must be character\\(1\\) or raw" ) expect_error( nanoarrow_schema_modify(schema, list(metadata = list(name = character()))), "must be character\\(1\\) or raw" ) expect_error( nanoarrow_schema_modify(schema, list(metadata = list(name = NA_character_))), "must not be NA_character_" ) }) test_that("schema modify can modify children", { schema_without_children <- na_struct() child_to_be <- schema_without_children child_to_be$name <- "should not appear" # NULL children to NULL children schema2 <- nanoarrow_schema_modify( schema_without_children, list(children = NULL) ) expect_identical(schema2$children, list()) expect_identical(schema2$format, schema_without_children$format) # NULL children to zero-size list() children schema2 <- nanoarrow_schema_modify( schema_without_children, list(children = list()) ) 
expect_identical(schema2$children, list()) expect_identical(schema2$format, schema_without_children$format) # with unnamed child list schema2 <- nanoarrow_schema_modify( schema_without_children, list(children = list(child_to_be)) ) expect_length(schema2$children, 1) expect_named(schema2$children, "") expect_identical(schema2$format, schema_without_children$format) expect_identical(schema2$children[[1]]$format, child_to_be$format) # with another type of unnamed child list schema2 <- nanoarrow_schema_modify( schema_without_children, list(children = setNames(list(child_to_be), "")) ) expect_length(schema2$children, 1) expect_named(schema2$children, "") expect_identical(schema2$format, schema_without_children$format) expect_identical(schema2$children[[1]]$format, child_to_be$format) # with oddly unnamed child list schema2 <- nanoarrow_schema_modify( schema_without_children, list(children = setNames(list(child_to_be), NA_character_)) ) expect_length(schema2$children, 1) expect_named(schema2$children, "") expect_identical(schema2$format, schema_without_children$format) expect_identical(schema2$children[[1]]$format, child_to_be$format) # with a normal named child list schema2 <- nanoarrow_schema_modify( schema_without_children, list(children = list("a new name" = child_to_be)) ) expect_length(schema2$children, 1) expect_named(schema2$children, "a new name") expect_identical(schema2$format, schema_without_children$format) expect_identical(schema2$children[[1]]$format, child_to_be$format) schema_with_children <- na_struct(list(existing_name = na_string())) # some children to NULL children schema2 <- nanoarrow_schema_modify( schema_with_children, list(children = NULL) ) expect_identical(schema2$children, list()) expect_identical(schema2$format, schema_with_children$format) # replace identical number of children schema2 <- nanoarrow_schema_modify( schema_with_children, list(children = list("a new name" = child_to_be)) ) expect_length(schema2$children, 1) 
expect_named(schema2$children, "a new name") expect_identical(schema2$format, schema_with_children$format) expect_identical(schema2$children[[1]]$format, child_to_be$format) # replace with more children another_child_to_be <- na_bool() schema2 <- nanoarrow_schema_modify( schema_with_children, list( children = list( "a new name" = child_to_be, "another new name" = another_child_to_be ) ) ) expect_length(schema2$children, 2) expect_named(schema2$children, c("a new name", "another new name")) expect_identical(schema2$format, schema_with_children$format) expect_identical(schema2$children[[1]]$format, child_to_be$format) expect_identical(schema2$children[[2]]$format, another_child_to_be$format) }) test_that("schema modify can modify dictionary", { schema_without_dictionary <- na_int32() # NULL -> NULL schema2 <- nanoarrow_schema_modify( schema_without_dictionary, list(dictionary = NULL) ) expect_null(schema2$dictionary) expect_identical(schema2$flags, schema_without_dictionary$flags) expect_identical(schema2$format, schema_without_dictionary$format) expect_identical(schema2$name, schema_without_dictionary$name) # NULL -> non-null schema2 <- nanoarrow_schema_modify( schema_without_dictionary, list(dictionary = na_int32()) ) expect_identical(schema2$dictionary$format, "i") expect_identical(schema2$flags, schema_without_dictionary$flags) expect_identical(schema2$format, schema_without_dictionary$format) expect_identical(schema2$name, schema_without_dictionary$name) # non-null -> NULL schema_with_dictionary <- schema2 schema2 <- nanoarrow_schema_modify( schema_with_dictionary, list(dictionary = NULL) ) expect_null(schema2$dictionary) expect_identical(schema2$flags, schema_with_dictionary$flags) expect_identical(schema2$format, schema_with_dictionary$format) expect_identical(schema2$name, schema_with_dictionary$name) # non-null -> non-null schema2 <- nanoarrow_schema_modify( schema_with_dictionary, list(dictionary = na_string()) ) expect_identical(schema2$dictionary$format, 
"u") expect_identical(schema2$flags, schema_with_dictionary$flags) expect_identical(schema2$format, schema_with_dictionary$format) expect_identical(schema2$name, schema_with_dictionary$name) }) test_that("schema modify respects the validate flag", { schema <- na_int32() schema2 <- nanoarrow_schema_modify( schema, list(format = "totally invalid"), validate = FALSE ) expect_identical(schema2$format, "totally invalid") expect_error( nanoarrow_schema_modify( schema, list(format = "totally invalid"), validate = TRUE ), "Error parsing schema->format" ) }) test_that("[[<- works for schema", { schema <- na_int32() schema[["name"]] <- "a new name" expect_identical(schema$name, "a new name") schema <- na_int32() schema[[2]] <- "yet a new name" expect_identical(schema$name, "yet a new name") expect_error( schema[["not_an_item"]] <- "something", "Can't modify schema" ) expect_error( schema[[NA_character_]] <- "something", "must be character" ) expect_error( schema[[character()]] <- "something", "must be character" ) expect_error( schema[[NA_integer_]] <- "something", "must be character" ) expect_error( schema[[integer()]] <- "something", "must be character" ) expect_error( schema[[12]] <- "something", "must be character" ) }) test_that("$<- works for schema", { schema <- na_int32() schema$name <- "a new name" expect_identical(schema$name, "a new name") expect_error( schema$not_an_item <- "something", "Can't modify schema" ) }) test_that("<- assignment works for schema$children", { schema <- na_struct(list(col1 = na_int32(), col2 = na_string())) schema$children$col1 <- na_bool() expect_named(schema$children, c("col1", "col2")) expect_identical(schema$children$col1$format, "b") expect_identical(schema$children$col1$name, "col1") names(schema$children)[1] <- "col1_new" expect_named(schema$children, c("col1_new", "col2")) expect_identical(schema$children$col1_new$format, "b") expect_identical(schema$children$col1_new$name, "col1_new") }) test_that("<- assignment works for 
schema$metadata", { schema <- na_int32() schema$metadata$key <- "value" expect_identical(schema$metadata$key, "value") names(schema$metadata)[1] <- "new_key" expect_identical(schema$metadata$new_key, "value") schema$metadata$new_key <- "new value" expect_identical(schema$metadata$new_key, "new value") }) nanoarrow/tests/testthat/test-nanoarrow-package.R0000644000176200001440000000164414355103326021770 0ustar liggesusers# Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. test_that("nanoarrow_version() works", { expect_identical( nanoarrow_version(runtime = TRUE), nanoarrow_version(runtime = FALSE) ) }) nanoarrow/tests/testthat/test-altrep.R0000644000176200001440000000753314402162315017657 0ustar liggesusers# Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

test_that("nanoarrow_altrep_chr() returns NULL for unsupported types", {
  # Exercise more than one non-string storage type (the second assertion
  # used to be an exact duplicate of the first)
  expect_null(nanoarrow_altrep_chr(as_nanoarrow_array(1:10)))
  expect_null(nanoarrow_altrep_chr(as_nanoarrow_array(c(1.5, 2.5))))
})

test_that("nanoarrow_altrep_chr() works for string", {
  x <- as_nanoarrow_array(c(NA, letters), schema = na_string())
  x_altrep <- nanoarrow_altrep_chr(x)

  # NOTE(review): the pattern was an empty string (matches any output); it is
  # restored to the ALTREP inspect banner -- confirm against the C inspect
  # method if this ever fails
  expect_output(.Internal(inspect(x_altrep)), "<nanoarrow::altrep_chr\\[27\\]>")

  # Check that some common operations that call the string elt method
  # don't materialize the vector
  expect_identical(x_altrep, c(NA, letters))
  expect_length(x_altrep, 27)
  expect_false(is_nanoarrow_altrep_materialized(x_altrep))

  # Setting an element will materialize, duplicate, then modify
  x_altrep2 <- x_altrep
  x_altrep2[1] <- "not a letter"
  expect_identical(x_altrep2, c("not a letter", letters))
  expect_true(is_nanoarrow_altrep_materialized(x_altrep))

  # Check the same operations on the materialized output
  expect_identical(x_altrep, c(NA, letters))
  expect_length(x_altrep, 27)

  # Materialization should get printed in inspect()
  expect_output(
    .Internal(inspect(x_altrep)),
    "<materialized nanoarrow::altrep_chr\\[27\\]>"
  )

  # For good measure, force materialization again and check
  nanoarrow_altrep_force_materialize(x_altrep)
  expect_identical(x_altrep, c(NA, letters))
  expect_length(x_altrep, 27)
})

test_that("nanoarrow_altrep_chr() works for large string", {
  skip_if_not_installed("arrow")

  x <- as_nanoarrow_array(letters, schema = na_large_string())
  x_altrep <- nanoarrow_altrep_chr(x)
  expect_identical(x_altrep, letters)
})

test_that("is_nanoarrow_altrep() returns true for nanoarrow altrep objects", {
  expect_false(is_nanoarrow_altrep("not altrep"))
  expect_false(is_nanoarrow_altrep(1:10))
  expect_true(is_nanoarrow_altrep(nanoarrow_altrep_chr(as_nanoarrow_array("whee"))))
})

test_that("nanoarrow_altrep_chr_force_materialize() forces materialization", {
  x <- as_nanoarrow_array(letters, schema = na_string())
  x_altrep <- nanoarrow_altrep_chr(x)

  # The return value is the number of vectors that were materialized
  expect_identical(nanoarrow_altrep_force_materialize("not altrep"), 0L)
  expect_identical(nanoarrow_altrep_force_materialize(x_altrep), 1L)

  x <- as_nanoarrow_array(letters, schema = na_string())
  x_altrep_df <- data.frame(x = nanoarrow_altrep_chr(x), stringsAsFactors = FALSE)
  # Columns are only visited when recursive = TRUE
  expect_identical(
    nanoarrow_altrep_force_materialize(x_altrep_df, recursive = FALSE),
    0L
  )
  expect_identical(
    nanoarrow_altrep_force_materialize(x_altrep_df, recursive = TRUE),
    1L
  )
  # A second pass finds nothing left to materialize
  expect_identical(
    nanoarrow_altrep_force_materialize(x_altrep_df, recursive = TRUE),
    0L
  )
})

test_that("is_nanoarrow_altrep_materialized() checks for materialization", {
  # Non-nanoarrow-ALTREP input has no materialization state: NA
  expect_identical(is_nanoarrow_altrep_materialized("not altrep"), NA)
  expect_identical(is_nanoarrow_altrep_materialized(1:10), NA)

  x <- as_nanoarrow_array(letters, schema = na_string())
  x_altrep <- nanoarrow_altrep_chr(x)
  expect_false(is_nanoarrow_altrep_materialized(x_altrep))
  expect_identical(nanoarrow_altrep_force_materialize(x_altrep), 1L)
  expect_true(is_nanoarrow_altrep_materialized(x_altrep))
})
nanoarrow/tests/testthat/test-buffer.R0000644000176200001440000000542514547575511017657 0ustar liggesusers# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.
You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

test_that("as_nanoarrow_buffer() works for nanoarrow_buffer", {
  buffer <- as_nanoarrow_array(1:4)$buffers[[2]]
  expect_identical(as_nanoarrow_buffer(buffer), buffer)
})

test_that("as_nanoarrow_buffer() works for R atomic types", {
  buffer_null <- as_nanoarrow_buffer(NULL)
  expect_identical(as.raw(buffer_null), raw(0))
  expect_identical(convert_buffer(buffer_null), blob::blob(raw(0)))
  expect_identical(as.vector(buffer_null), convert_buffer(buffer_null))

  buffer_raw <- as_nanoarrow_buffer(as.raw(0x00))
  expect_identical(as.raw(buffer_raw), raw(1))
  expect_identical(convert_buffer(buffer_raw), blob::blob(as.raw(0x00)))

  # Logical values occupy 4 bytes each and convert back as integer
  buffer_lgl <- as_nanoarrow_buffer(FALSE)
  expect_identical(as.raw(buffer_lgl), raw(4))
  expect_identical(convert_buffer(buffer_lgl), 0L)

  # Assert on buffer_int itself (these checks previously re-tested buffer_lgl)
  buffer_int <- as_nanoarrow_buffer(0L)
  expect_identical(as.raw(buffer_int), raw(4))
  expect_identical(convert_buffer(buffer_int), 0L)

  # A double is 8 bytes wide (previously buffer_lgl was re-tested here and
  # buffer_dbl was never inspected)
  buffer_dbl <- as_nanoarrow_buffer(0)
  expect_identical(as.raw(buffer_dbl), raw(8))
  expect_identical(convert_buffer(buffer_dbl), 0)

  # Complex values are stored as two doubles (real, imaginary)
  buffer_cplx <- as_nanoarrow_buffer(complex(real = 0, imaginary = 0))
  expect_identical(as.raw(buffer_cplx), raw(16))
  expect_identical(convert_buffer(buffer_cplx), c(0, 0))

  buffer_chr <- as_nanoarrow_buffer("1234")
  expect_identical(as.raw(buffer_chr), charToRaw("1234"))
  expect_identical(convert_buffer(buffer_chr), "1234")
})

test_that("buffers can be printed", {
  expect_snapshot(str(as_nanoarrow_buffer(1:10)))
  expect_snapshot(str(as_nanoarrow_buffer(1:10000)))
  expect_snapshot(str(as_nanoarrow_buffer(strrep("abcdefg", 100))))
  expect_snapshot(str(as_nanoarrow_buffer(charToRaw(strrep("abcdefg", 100)))))

  # Remove the schema from the array before printing one of its buffers
  array <- as_nanoarrow_array(1:100)
  nanoarrow_array_set_schema(array, NULL)
  expect_snapshot(str(array$buffers[[2]]))
})

test_that("as_nanoarrow_buffer() errors for unsupported types", {
  expect_error(
    as_nanoarrow_buffer(NA_character_),
    "NA_character_ not supported"
  )

  expect_error(
    as_nanoarrow_buffer(environment()),
    "Can't convert object of type environment to nanoarrow_buffer"
  )
})
nanoarrow/tests/testthat/test-pkg-arrow.R0000644000176200001440000003654714547061553020324 0ustar liggesusers# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
test_that("infer_type() for nanoarrow_array works", { skip_if_not_installed("arrow") array <- as_nanoarrow_array(1:5) expect_true( arrow::infer_type(array)$Equals(arrow::int32()) ) }) test_that("infer_nanoarrow_schema() works for arrow objects", { skip_if_not_installed("arrow") int_schema <- infer_nanoarrow_schema(arrow::Array$create(1:10)) expect_true(arrow::as_data_type(int_schema)$Equals(arrow::int32())) int_schema <- infer_nanoarrow_schema(arrow::Scalar$create(1L)) expect_true(arrow::as_data_type(int_schema)$Equals(arrow::int32())) int_schema <- infer_nanoarrow_schema(arrow::ChunkedArray$create(1:10)) expect_true(arrow::as_data_type(int_schema)$Equals(arrow::int32())) int_schema <- infer_nanoarrow_schema(arrow::Expression$scalar(1L)) expect_true(arrow::as_data_type(int_schema)$Equals(arrow::int32())) tbl_schema_expected <- arrow::schema(x = arrow::int32()) tbl_schema <- infer_nanoarrow_schema(arrow::record_batch(x = 1L)) expect_true(arrow::as_schema(tbl_schema)$Equals(tbl_schema_expected)) tbl_schema <- infer_nanoarrow_schema(arrow::arrow_table(x = 1L)) expect_true(arrow::as_schema(tbl_schema)$Equals(tbl_schema_expected)) tbl_schema <- infer_nanoarrow_schema( arrow::RecordBatchReader$create(arrow::record_batch(x = 1L)) ) expect_true(arrow::as_schema(tbl_schema)$Equals(tbl_schema_expected)) skip_if_not(isTRUE(arrow::arrow_info()$capabilities["dataset"])) tbl_schema <- infer_nanoarrow_schema( arrow::InMemoryDataset$create(arrow::record_batch(x = 1L)) ) expect_true(arrow::as_schema(tbl_schema)$Equals(tbl_schema_expected)) tbl_schema <- infer_nanoarrow_schema( arrow::Scanner$create( arrow::InMemoryDataset$create(arrow::record_batch(x = 1L)) ) ) expect_true(arrow::as_schema(tbl_schema)$Equals(tbl_schema_expected)) }) test_that("nanoarrow_array to Array works", { skip_if_not_installed("arrow") int <- arrow::as_arrow_array(as_nanoarrow_array(1:5)) expect_true(int$Equals(arrow::Array$create(1:5))) dbl <- arrow::as_arrow_array(as_nanoarrow_array(1:5, schema = 
arrow::float64())) expect_true(dbl$Equals(arrow::Array$create(1:5, type = arrow::float64()))) dbl_casted <- arrow::as_arrow_array(as_nanoarrow_array(1:5), type = arrow::float64()) expect_true(dbl_casted$Equals(arrow::Array$create(1:5, type = arrow::float64()))) chr <- arrow::as_arrow_array(as_nanoarrow_array(c("one", "two"))) expect_true(chr$Equals(arrow::Array$create(c("one", "two")))) }) test_that("nanoarrow_array to Array works for child arrays", { skip_if_not_installed("arrow") df <- data.frame(a = 1, b = "two") batch <- as_nanoarrow_array(df) # This type of export is special because batch$children[[2]] has an SEXP # dependency on the original array. When we export it, we reverse that # dependency such that the exported array and the batch->children[1] array # are shells that call nanoarrow_release_sexp on a common object (i.e., sort of like # a shared pointer). array_from_column <- arrow::as_arrow_array(batch$children[[2]]) # The exported array should be valid expect_null(array_from_column$Validate()) # All the nanoarrow pointers should still be valid expect_true(nanoarrow_pointer_is_valid(batch)) expect_true(nanoarrow_pointer_is_valid(batch$children[[1]])) expect_true(nanoarrow_pointer_is_valid(batch$children[[2]])) # Let the exported arrow::Array go out of scope and maximize the # chance that the exported data release callback is called array_from_column <- NULL gc() Sys.sleep(0.1) # All the nanoarrow pointers should *still* be valid even after that # release callback is called expect_true(nanoarrow_pointer_is_valid(batch)) expect_true(nanoarrow_pointer_is_valid(batch$children[[1]])) expect_true(nanoarrow_pointer_is_valid(batch$children[[2]])) # Export one column again but this time let the `batch` go out of scope array_from_column <- arrow::as_arrow_array(batch$children[[1]]) batch <- NULL gc() Sys.sleep(0.1) # The exported array should still be valid expect_null(array_from_column$Validate()) }) test_that("Array to nanoarrow_array works", { 
skip_if_not_installed("arrow") int <- arrow::Array$create(1:5) int_array <- as_nanoarrow_array(int) expect_s3_class(int_array, "nanoarrow_array") int_schema <- infer_nanoarrow_schema(int_array) expect_s3_class(int_schema, "nanoarrow_schema") expect_true( arrow::as_arrow_array(int_array)$Equals( arrow::Array$create(1:5) ) ) dbl_array <- as_nanoarrow_array(int, schema = arrow::float64()) expect_s3_class(dbl_array, "nanoarrow_array") dbl_schema <- infer_nanoarrow_schema(dbl_array) expect_s3_class(dbl_schema, "nanoarrow_schema") expect_true( arrow::as_arrow_array(dbl_array)$Equals( arrow::Array$create(1:5, type = arrow::float64()) ) ) }) test_that("nanoarrow_array to ChunkedArray works", { skip_if_not_installed("arrow") int <- arrow::as_chunked_array(as_nanoarrow_array(1:5)) expect_true(int$Equals(arrow::ChunkedArray$create(1:5))) dbl_casted <- arrow::as_chunked_array(as_nanoarrow_array(1:5), type = arrow::float64()) expect_true(dbl_casted$Equals(arrow::ChunkedArray$create(1:5, type = arrow::float64()))) }) test_that("ChunkedArray to nanoarrow_array works", { skip_if_not_installed("arrow") int <- arrow::ChunkedArray$create(1:5) int_array <- as_nanoarrow_array(int) expect_s3_class(int_array, "nanoarrow_array") int_schema <- infer_nanoarrow_schema(int_array) expect_s3_class(int_schema, "nanoarrow_schema") expect_true( arrow::as_chunked_array(int_array)$Equals( arrow::ChunkedArray$create(1:5) ) ) dbl_array <- as_nanoarrow_array(int, schema = arrow::float64()) expect_s3_class(dbl_array, "nanoarrow_array") dbl_schema <- infer_nanoarrow_schema(dbl_array) expect_s3_class(dbl_schema, "nanoarrow_schema") expect_true( arrow::as_chunked_array(dbl_array)$Equals( arrow::ChunkedArray$create(1:5, type = arrow::float64()) ) ) }) test_that("ChunkedArray to nanoarrow_array_stream works", { skip_if_not_installed("arrow") int <- arrow::ChunkedArray$create(1:5) int_array_stream <- as_nanoarrow_array_stream(int) expect_s3_class(int_array_stream, "nanoarrow_array_stream") expect_true( 
arrow::as_chunked_array(int_array_stream)$Equals( arrow::ChunkedArray$create(1:5) ) ) dbl_array_stream <- as_nanoarrow_array_stream(int, schema = arrow::float64()) expect_s3_class(dbl_array_stream, "nanoarrow_array_stream") expect_true( arrow::as_chunked_array(dbl_array_stream)$Equals( arrow::ChunkedArray$create(1:5, type = arrow::float64()) ) ) }) test_that("Array to nanoarrow_array_stream works", { skip_if_not_installed("arrow") int <- arrow::Array$create(1:5) int_array_stream <- as_nanoarrow_array_stream(int) expect_s3_class(int_array_stream, "nanoarrow_array_stream") expect_true( arrow::as_arrow_array(int_array_stream)$Equals( arrow::Array$create(1:5) ) ) dbl_array_stream <- as_nanoarrow_array_stream(int, schema = arrow::float64()) expect_s3_class(dbl_array_stream, "nanoarrow_array_stream") expect_true( arrow::as_arrow_array(dbl_array_stream)$Equals( arrow::Array$create(1:5, type = arrow::float64()) ) ) empty_array_stream <- basic_array_stream(list(), na_int32()) expect_true( arrow::as_arrow_array(empty_array_stream)$Equals( arrow::concat_arrays(type = arrow::int32()) ) ) }) test_that("nanoarrow_array to RecordBatch works", { skip_if_not_installed("arrow") df <- data.frame(a = 1:5, b = letters[1:5]) batch <- arrow::as_record_batch(as_nanoarrow_array(df)) expect_true( batch$Equals(arrow::record_batch(a = 1:5, b = letters[1:5])) ) batch_casted <- arrow::as_record_batch( as_nanoarrow_array(df), schema = arrow::schema(a = arrow::float64(), b = arrow::string()) ) expect_true( batch_casted$Equals( arrow::record_batch(a = as.double(1:5), b = letters[1:5]) ) ) }) test_that("RecordBatch to nanoarrow_array", { skip_if_not_installed("arrow") batch <- arrow::record_batch(a = 1:5, b = letters[1:5]) struct_array <- as_nanoarrow_array(batch) expect_s3_class(struct_array, "nanoarrow_array") struct_schema <- infer_nanoarrow_schema(struct_array) expect_s3_class(struct_schema, "nanoarrow_schema") expect_true( arrow::as_record_batch(struct_array)$Equals( arrow::record_batch(a = 
1:5, b = letters[1:5]) ) ) struct_array_casted <- as_nanoarrow_array( batch, schema = arrow::schema(a = arrow::float64(), b = arrow::string()) ) expect_s3_class(struct_array_casted, "nanoarrow_array") struct_schema_casted <- infer_nanoarrow_schema(struct_array_casted) expect_s3_class(struct_schema_casted, "nanoarrow_schema") expect_true( arrow::as_record_batch(struct_array_casted)$Equals( arrow::record_batch(a = as.double(1:5), b = letters[1:5]) ) ) }) test_that("nanoarrow_array to Table works", { skip_if_not_installed("arrow") df <- data.frame(a = 1:5, b = letters[1:5]) table <- arrow::as_arrow_table(as_nanoarrow_array(df)) expect_true( table$Equals(arrow::arrow_table(a = 1:5, b = letters[1:5])) ) table_casted <- arrow::as_arrow_table( as_nanoarrow_array(df), schema = arrow::schema(a = arrow::float64(), b = arrow::string()) ) expect_true( table_casted$Equals( arrow::arrow_table(a = as.double(1:5), b = letters[1:5]) ) ) }) test_that("Table to nanoarrow_array", { skip_if_not_installed("arrow") table <- arrow::arrow_table(a = 1:5, b = letters[1:5]) struct_array <- as_nanoarrow_array(table) expect_s3_class(struct_array, "nanoarrow_array") struct_schema <- infer_nanoarrow_schema(struct_array) expect_s3_class(struct_schema, "nanoarrow_schema") expect_true( arrow::as_arrow_table(struct_array)$Equals( arrow::arrow_table(a = 1:5, b = letters[1:5]) ) ) struct_array_casted <- as_nanoarrow_array( table, schema = arrow::schema(a = arrow::float64(), b = arrow::string()) ) expect_s3_class(struct_array_casted, "nanoarrow_array") struct_schema_casted <- infer_nanoarrow_schema(struct_array_casted) expect_s3_class(struct_schema_casted, "nanoarrow_schema") expect_true( arrow::as_arrow_table(struct_array_casted)$Equals( arrow::arrow_table(a = as.double(1:5), b = letters[1:5]) ) ) }) test_that("Table to nanoarrow_array_stream works", { skip_if_not_installed("arrow") table <- arrow::arrow_table(a = 1:5, b = letters[1:5]) stream <- as_nanoarrow_array_stream(table) expect_s3_class(stream, 
"nanoarrow_array_stream") expect_true( arrow::as_arrow_table(stream)$Equals( arrow::arrow_table(a = 1:5, b = letters[1:5]) ) ) # Check cast in the stream -> table direction stream <- as_nanoarrow_array_stream(table) expect_true( arrow::as_arrow_table( stream, schema = arrow::schema(a = arrow::float64(), b = arrow::string()) )$Equals( arrow::arrow_table(a = as.double(1:5), b = letters[1:5]) ) ) # Check cast in the table -> stream direction stream_casted <- as_nanoarrow_array_stream( table, schema = arrow::schema(a = arrow::float64(), b = arrow::string()) ) expect_s3_class(stream_casted, "nanoarrow_array_stream") expect_true( arrow::as_arrow_table(stream_casted)$Equals( arrow::arrow_table(a = as.double(1:5), b = letters[1:5]) ) ) }) test_that("Dataset to nanoarrow_array_stream works", { skip_if_not_installed("arrow") skip_if_not(isTRUE(arrow::arrow_info()$capabilities["dataset"])) dataset <- arrow::InMemoryDataset$create(arrow::arrow_table(a = 1:5, b = letters[1:5])) stream <- as_nanoarrow_array_stream(dataset) expect_s3_class(stream, "nanoarrow_array_stream") expect_true( arrow::as_arrow_table(stream)$Equals( arrow::arrow_table(a = 1:5, b = letters[1:5]) ) ) }) test_that("Scanner to nanoarrow_array_stream works", { skip_if_not_installed("arrow") skip_if_not(isTRUE(arrow::arrow_info()$capabilities["dataset"])) dataset <- arrow::InMemoryDataset$create(arrow::arrow_table(a = 1:5, b = letters[1:5])) scanner <- arrow::Scanner$create(dataset) stream <- as_nanoarrow_array_stream(scanner) expect_s3_class(stream, "nanoarrow_array_stream") expect_true( arrow::as_arrow_table(stream)$Equals( arrow::arrow_table(a = 1:5, b = letters[1:5]) ) ) }) test_that("nanoarrow_schema to DataType works", { skip_if_not_installed("arrow") int_schema <- as_nanoarrow_schema(arrow::int32()) arrow_type <- arrow::as_data_type(int_schema) expect_true(arrow_type$Equals(arrow::int32())) }) test_that("DataType to nanoarrow_schema", { skip_if_not_installed("arrow") schema <- 
as_nanoarrow_schema(arrow::int32()) expect_s3_class(schema, "nanoarrow_schema") expect_true(arrow::as_data_type(schema)$Equals(arrow::int32())) }) test_that("Field to nanoarrow_schema", { skip_if_not_installed("arrow") schema <- as_nanoarrow_schema(arrow::field("name", arrow::int32())) expect_s3_class(schema, "nanoarrow_schema") expect_true(arrow::as_data_type(schema)$Equals(arrow::int32())) }) test_that("nanoarrow_schema to Schema works", { skip_if_not_installed("arrow") struct_schema <- as_nanoarrow_schema( arrow::struct(a = arrow::int32(), b = arrow::string()) ) arrow_schema <- arrow::as_schema(struct_schema) expect_true(arrow_schema$Equals(arrow::schema(a = arrow::int32(), b = arrow::string()))) }) test_that("Schema to nanoarrow_schema", { skip_if_not_installed("arrow") schema <- as_nanoarrow_schema(arrow::schema(name = arrow::int32())) expect_s3_class(schema, "nanoarrow_schema") expect_true(arrow::as_schema(schema)$Equals(arrow::schema(name = arrow::int32()))) }) test_that("nanoarrow_array_stream to RecordBatchReader works", { skip_if_not_installed("arrow") reader <- arrow::as_record_batch_reader( arrow::record_batch(a = 1:5, b = letters[1:5]) ) array_stream <- as_nanoarrow_array_stream(reader) reader_roundtrip <- arrow::as_record_batch_reader(array_stream) expect_false(nanoarrow_pointer_is_valid(array_stream)) expect_true( reader_roundtrip$read_next_batch()$Equals( arrow::record_batch(a = 1:5, b = letters[1:5]) ) ) expect_null(reader_roundtrip$read_next_batch()) }) test_that("RecordBatchReader to nanoarrow_array_stream works", { skip_if_not_installed("arrow") reader <- arrow::as_record_batch_reader( arrow::record_batch(a = 1:5, b = letters[1:5]) ) array_stream <- as_nanoarrow_array_stream(reader) expect_s3_class(array_stream, "nanoarrow_array_stream") reader_roundtrip <- arrow::as_record_batch_reader(array_stream) expect_true( reader_roundtrip$read_next_batch()$Equals( arrow::record_batch(a = 1:5, b = letters[1:5]) ) ) 
expect_null(reader_roundtrip$read_next_batch()) }) nanoarrow/tests/testthat/test-extension.R0000644000176200001440000000675114502402562020407 0ustar liggesusers# Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. test_that("extension types can be registered and unregistered", { spec <- nanoarrow_extension_spec() register_nanoarrow_extension("some_ext", spec) expect_identical(resolve_nanoarrow_extension("some_ext"), spec) unregister_nanoarrow_extension("some_ext") expect_identical(resolve_nanoarrow_extension("some_ext"), NULL) }) test_that("infer_nanoarrow_ptype() dispatches on registered extension spec", { register_nanoarrow_extension( "some_ext", nanoarrow_extension_spec(subclass = "some_spec_class") ) on.exit(unregister_nanoarrow_extension("some_ext")) infer_nanoarrow_ptype_extension.some_spec_class <- function(spec, x, ...) 
{ infer_nanoarrow_ptype_extension(NULL, x, ..., warn_unregistered = FALSE) } s3_register( "nanoarrow::infer_nanoarrow_ptype_extension", "some_spec_class", infer_nanoarrow_ptype_extension.some_spec_class ) expect_identical( infer_nanoarrow_ptype( na_extension(na_struct(list(some_name = na_int32())), "some_ext") ), data.frame(some_name = integer()) ) }) test_that("convert_array() dispatches on registered extension spec", { register_nanoarrow_extension( "some_ext", nanoarrow_extension_spec(subclass = "some_spec_class") ) on.exit(unregister_nanoarrow_extension("some_ext")) convert_array_extension.some_spec_class <- function(spec, array, to, ...) { convert_array_extension(NULL, array, to, ..., warn_unregistered = FALSE) } s3_register( "nanoarrow::convert_array_extension", "some_spec_class", convert_array_extension.some_spec_class ) expect_identical( convert_array( nanoarrow_extension_array(data.frame(some_name = 1:5), "some_ext") ), data.frame(some_name = 1:5) ) }) test_that("as_nanoarrow_array() dispatches on registered extension spec", { register_nanoarrow_extension( "some_ext", nanoarrow_extension_spec(subclass = "some_spec_class") ) on.exit(unregister_nanoarrow_extension("some_ext")) expect_error( as_nanoarrow_array( data.frame(some_name = 1:5), schema = na_extension( na_struct(list(some_name = na_int32())), "some_ext" ) ), "not implemented for extension" ) as_nanoarrow_array_extension.some_spec_class <- function(spec, x, ..., schema = NULL) { nanoarrow_extension_array(x, "some_ext") } s3_register( "nanoarrow::as_nanoarrow_array_extension", "some_spec_class", as_nanoarrow_array_extension.some_spec_class ) ext_array <- as_nanoarrow_array( data.frame(some_name = 1:5), schema = na_extension( na_struct(list(some_name = na_int32())), "some_ext" ) ) expect_identical( infer_nanoarrow_schema(ext_array)$metadata[["ARROW:extension:name"]], "some_ext" ) }) nanoarrow/tests/testthat/test-array.R0000644000176200001440000004015014502402506017476 0ustar liggesusers# Licensed to the 
# Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# Tests for the nanoarrow_array wrapper: printing, conversion, the list-like
# accessor interface, and nanoarrow_array_modify() / `[[<-` / `$<-`.

test_that("nanoarrow_array format, print, and str methods work", {
  array <- as_nanoarrow_array(1:10)
  # NOTE(review): the expected format() string below is empty in this copy of
  # the file; it looks like "<...>" text was stripped during extraction.
  # Confirm the literal against the package's format() method.
  expect_identical(format(array), "")
  expect_output(expect_identical(str(array), array), "nanoarrow_array")
  expect_output(expect_identical(print(array), array), "nanoarrow_array")
})

test_that("released nanoarrow_array format, print, and str methods work", {
  array <- nanoarrow_allocate_array()
  # NOTE(review): expected literal is possibly truncated (see note above).
  expect_identical(format(array), "")
  expect_output(expect_identical(str(array), array), "nanoarrow_array")
  expect_output(expect_identical(print(array), array), "nanoarrow_array")
})

test_that("schemaless nanoarrow_array format, print, and str methods work", {
  array <- as_nanoarrow_array(1:10)
  nanoarrow_array_set_schema(array, NULL)
  # NOTE(review): expected literal is possibly truncated (see note above).
  expect_identical(format(array), "[10]>")
  expect_output(expect_identical(str(array), array), "nanoarrow_array")
  expect_output(expect_identical(print(array), array), "nanoarrow_array")
})

test_that("as_nanoarrow_array() / convert_array() default method works", {
  array <- as_nanoarrow_array(1:10)
  expect_identical(convert_array(array), 1:10)

  array <- as_nanoarrow_array(as.double(1:10), schema = na_double())
  expect_identical(convert_array(array), as.double(1:10))
})

test_that("infer_nanoarrow_schema() works for nanoarrow_array", {
  array <- as_nanoarrow_array(1:10)
  schema <- infer_nanoarrow_schema(array)
  expect_true(nanoarrow_schema_identical(schema, na_int32()))

  nanoarrow_array_set_schema(array, NULL)
  expect_error(infer_nanoarrow_schema(array), "has no associated schema")
})

test_that("nanoarrow_array_set_schema() errors for invalid schema/array", {
  array <- as_nanoarrow_array(integer())
  schema <- na_string()
  expect_error(
    nanoarrow_array_set_schema(array, schema),
    "Expected array with 3 buffer\\(s\\) but found 2 buffer\\(s\\)"
  )
})

test_that("as.vector() and as.data.frame() work for array", {
  array <- as_nanoarrow_array(1:10)
  expect_identical(as.vector(array), 1:10)

  struct_array <- as_nanoarrow_array(data.frame(a = 1:10))
  expect_identical(as.data.frame(struct_array), data.frame(a = 1:10))

  expect_error(
    as.data.frame(array),
    "Can't convert array with type int32 to data.frame"
  )
})

test_that("as_tibble() works for array()", {
  # tibble is only referenced via `::` here, so guard against its absence
  skip_if_not_installed("tibble")

  struct_array <- as_nanoarrow_array(data.frame(a = 1:10))
  expect_identical(tibble::as_tibble(struct_array), tibble::tibble(a = 1:10))
})

test_that("schemaless array list interface works for non-nested types", {
  array <- as_nanoarrow_array(1:10)
  nanoarrow_array_set_schema(array, NULL)

  expect_identical(length(array), 6L)
  expect_identical(
    names(array),
    c("length", "null_count", "offset", "buffers", "children", "dictionary")
  )
  expect_identical(array$length, 10L)
  expect_identical(array$null_count, 0L)
  expect_identical(array$offset, 0L)
  expect_length(array$buffers, 2L)
  expect_s3_class(array$buffers[[1]], "nanoarrow_buffer")
  expect_s3_class(array$buffers[[2]], "nanoarrow_buffer")
  expect_null(array$children)
  expect_null(array$dictionary)
})

test_that("schemaless array list interface works for nested types", {
  array <- as_nanoarrow_array(
    data.frame(a = 1L, b = "two", stringsAsFactors = FALSE)
  )
  nanoarrow_array_set_schema(array, NULL)

  expect_length(array$children, 2L)
  expect_length(array$children[[1]]$buffers, 2L)
  expect_length(array$children[[2]]$buffers, 3L)
  expect_s3_class(array$children[[1]], "nanoarrow_array")
  expect_s3_class(array$children[[2]], "nanoarrow_array")

  info_recursive <- nanoarrow_array_proxy(array, recursive = TRUE)
  expect_type(info_recursive$children[[1]], "list")
  expect_length(info_recursive$children[[1]]$buffers, 2L)
})

test_that("schemaless array list interface works for dictionary types", {
  array <- as_nanoarrow_array(factor(letters[1:5]))
  nanoarrow_array_set_schema(array, NULL)

  expect_length(array$buffers, 2L)
  expect_length(array$dictionary$buffers, 3L)
  expect_s3_class(array$dictionary, "nanoarrow_array")

  info_recursive <- nanoarrow_array_proxy_safe(array, recursive = TRUE)
  expect_type(info_recursive$dictionary, "list")
  expect_length(info_recursive$dictionary$buffers, 3L)
})

test_that("array list interface classes data buffers for relevant types", {
  # The list names double as the expected `data_type` of the data buffer
  types <- list(
    int8 = na_int8(),
    uint8 = na_uint8(),
    int16 = na_int16(),
    uint16 = na_uint16(),
    int32 = na_int32(),
    uint32 = na_uint32(),
    int64 = na_int64(),
    uint64 = na_uint64(),
    half_float = na_half_float(),
    float = na_float(),
    double = na_double(),
    decimal128 = na_decimal128(2, 3),
    decimal256 = na_decimal256(2, 3)
  )

  arrays <- lapply(types, function(x) nanoarrow_array_init(x))

  for (nm in names(arrays)) {
    expect_identical(arrays[[!!nm]]$buffers[[1]]$type, "validity")
    expect_identical(arrays[[!!nm]]$buffers[[1]]$data_type, "bool")
    expect_identical(arrays[[!!nm]]$buffers[[2]]$type, "data")
    expect_identical(arrays[[!!nm]]$buffers[[2]]$data_type, nm)
  }
})

test_that("array list interface classes offset buffers for relevant types", {
  arr_string <- nanoarrow_array_init(na_string())
  expect_identical(arr_string$buffers[[2]]$type, "data_offset")
  expect_identical(arr_string$buffers[[2]]$data_type, "int32")
  expect_identical(arr_string$buffers[[3]]$type, "data")
  expect_identical(arr_string$buffers[[3]]$data_type, "string")

  arr_large_string <- nanoarrow_array_init(na_large_string())
  expect_identical(arr_large_string$buffers[[2]]$type, "data_offset")
  expect_identical(arr_large_string$buffers[[2]]$data_type, "int64")

  arr_binary <- nanoarrow_array_init(na_binary())
  expect_identical(arr_binary$buffers[[2]]$type, "data_offset")
  expect_identical(arr_binary$buffers[[2]]$data_type, "int32")

  arr_large_binary <- nanoarrow_array_init(na_large_binary())
  expect_identical(arr_large_binary$buffers[[2]]$type, "data_offset")
  expect_identical(arr_large_binary$buffers[[2]]$data_type, "int64")
})

test_that("array list interface works for nested types", {
  array <- as_nanoarrow_array(
    data.frame(a = 1L, b = "two", stringsAsFactors = FALSE)
  )

  expect_named(array$children, c("a", "b"))
  expect_s3_class(array$children[[1]], "nanoarrow_array")
  expect_s3_class(
    infer_nanoarrow_schema(array$children[[1]]),
    "nanoarrow_schema"
  )

  info_recursive <- nanoarrow_array_proxy_safe(array, recursive = TRUE)
  expect_type(info_recursive$children, "list")
  expect_identical(
    info_recursive$children$a$buffers[[2]]$type,
    "data"
  )
  expect_identical(
    info_recursive$children$b$buffers[[2]]$type,
    "data_offset"
  )
})

test_that("array list interface works for dictionary types", {
  array <- as_nanoarrow_array(factor(letters[1:5]))

  expect_identical(array$buffers[[2]]$type, "data")
  expect_identical(array$dictionary$buffers[[2]]$type, "data_offset")

  info_recursive <- nanoarrow_array_proxy_safe(array, recursive = TRUE)
  expect_type(info_recursive$dictionary, "list")
  expect_identical(info_recursive$dictionary$buffers[[2]]$type, "data_offset")
})

test_that("array modify errors for invalid components", {
  array <- as_nanoarrow_array(1:5)

  expect_error(
    nanoarrow_array_modify(array, list(1, 2, 3)),
    "`new_values`"
  )

  expect_error(
    nanoarrow_array_modify(array, list(not_an_item = NULL)),
    "Can't modify array"
  )
})

test_that("array modify does not copy if length(new_values) == 0", {
  array <- as_nanoarrow_array(1:5)
  expect_identical(
    nanoarrow_pointer_addr_chr(nanoarrow_array_modify(array, list())),
    nanoarrow_pointer_addr_chr(array)
  )
})

test_that("array modify can modify length", {
  array <- as_nanoarrow_array(1:5)

  array2 <- nanoarrow_array_modify(array, list(length = 4))
  expect_identical(convert_array(array2), 1:4)
  # The input array must be left untouched
  expect_identical(array$length, 5L)

  expect_error(
    nanoarrow_array_modify(array, list(length = NULL)),
    "array\\$length must be double"
  )

  expect_error(
    nanoarrow_array_modify(array, list(length = NA_real_)),
    "array\\$length must be finite"
  )

  expect_error(
    nanoarrow_array_modify(array, list(length = -1)),
    "array\\$length must be finite and greater than zero"
  )
})

test_that("array modify can modify null_count", {
  array <- as_nanoarrow_array(c(1L, NA, 2L, NA, 3L))

  array2 <- nanoarrow_array_modify(array, list(null_count = -1))
  expect_identical(array2$null_count, -1L)
  # The input array must be left untouched
  expect_identical(array$null_count, 2L)

  expect_error(
    nanoarrow_array_modify(array, list(null_count = NULL)),
    "array\\$null_count must be double"
  )

  expect_error(
    nanoarrow_array_modify(array, list(null_count = NA_real_)),
    "array\\$null_count must be finite"
  )

  expect_error(
    nanoarrow_array_modify(array, list(null_count = -2)),
    "array\\$null_count must be finite and greater than -1"
  )
})

test_that("array modify can modify offset", {
  array <- as_nanoarrow_array(1:5)

  array2 <- nanoarrow_array_modify(array, list(length = 4, offset = 1))
  expect_identical(convert_array(array2), 2:5)
  # The input array must be left untouched
  expect_identical(array$length, 5L)

  expect_error(
    nanoarrow_array_modify(array, list(offset = NULL)),
    "array\\$offset must be double"
  )

  expect_error(
    nanoarrow_array_modify(array, list(offset = NA_real_)),
    "array\\$offset must be finite"
  )

  expect_error(
    nanoarrow_array_modify(array, list(offset = -1)),
    "array\\$offset must be finite and greater than zero"
  )
})

test_that("array modify can modify buffers", {
  array <- as_nanoarrow_array(1:5)

  # Replace with brand new buffer
  array2 <- nanoarrow_array_modify(array, list(buffers = list(NULL, 6:10)))
  expect_identical(convert_array(array2), 6:10)
  expect_identical(convert_array(array), 1:5)

  # Re-use buffers from another array
  array_with_nulls <- as_nanoarrow_array(c(1L, NA, 2L, NA, 3L))
  array2 <- nanoarrow_array_modify(
    array,
    list(
      null_count = -1,
      buffers = list(
        array_with_nulls$buffers[[1]],
        array$buffers[[2]]
      )
    )
  )

  expect_identical(convert_array(array2), c(1L, NA, 3L, NA, 5L))
  expect_identical(convert_array(array), 1:5)
  expect_identical(convert_array(array_with_nulls), c(1L, NA, 2L, NA, 3L))

  # Should work even after the source arrays go out of scope
  array <- NULL
  array_with_nulls <- NULL
  gc()
  expect_identical(convert_array(array2), c(1L, NA, 3L, NA, 5L))

  array <- as_nanoarrow_array(1:5)
  expect_error(
    nanoarrow_array_modify(array, list(buffers = rep(list(NULL), 4))),
    "must be <= 3"
  )

  # Check that specifying too few buffers will result in a validation error
  expect_error(
    nanoarrow_array_modify(array, list(buffers = list()), validate = TRUE),
    "Expected 2 buffer"
  )
})

test_that("array modify checks buffer sizes", {
  array <- as_nanoarrow_array(1:5)
  expect_error(
    nanoarrow_array_modify(array, list(length = 6)),
    ">= 24 bytes but found buffer with 20 bytes"
  )
})

test_that("array modify can modify children", {
  array_with_children <- as_nanoarrow_array(data.frame(x = 1L))

  # Children -> no children
  array2 <- nanoarrow_array_modify(array_with_children, list(children = NULL))
  expect_identical(
    convert_array(array2),
    new_data_frame(setNames(list(), character()), nrow = 1L)
  )

  # No children -> no children
  # (start from the child-less array; previously this re-ran the
  # "Children -> no children" case against `array_with_children`)
  array_without_children <- array2
  array2 <- nanoarrow_array_modify(
    array_without_children,
    list(children = NULL)
  )
  expect_identical(
    convert_array(array2),
    new_data_frame(setNames(list(), character()), nrow = 1L)
  )

  # No children -> children
  array2 <- nanoarrow_array_modify(
    array_without_children,
    list(children = list(y = 2L))
  )
  expect_identical(convert_array(array2), data.frame(y = 2L))

  # Replace same number of children
  array2 <- nanoarrow_array_modify(
    array_with_children,
    list(children = list(y = 2L))
  )
  expect_identical(convert_array(array2), data.frame(y = 2L))
})

test_that("array modify can modify dictionary", {
  array_without_dictionary <- as_nanoarrow_array(0L)
  array_with_dictionary <- as_nanoarrow_array(factor("a"))

  # No dictionary -> no dictionary
  array2 <- nanoarrow_array_modify(
    array_without_dictionary,
    list(dictionary = NULL)
  )
  expect_identical(convert_array(array2), 0L)

  # No dictionary -> dictionary
  array2 <- nanoarrow_array_modify(
    array_without_dictionary,
    list(dictionary = "a")
  )
  expect_identical(convert_array(array2$dictionary), "a")

  # Dictionary -> no dictionary
  array2 <- nanoarrow_array_modify(
    array_with_dictionary,
    list(dictionary = NULL)
  )
  expect_identical(convert_array(array2), 0L)

  # Dictionary -> new dictionary
  array2 <- nanoarrow_array_modify(
    array_with_dictionary,
    list(dictionary = "b")
  )
  expect_identical(convert_array(array2$dictionary), "b")
})

test_that("array modify can modify array with no schema attached", {
  array <- as_nanoarrow_array(1L)
  nanoarrow_array_set_schema(array, NULL)

  array2 <- nanoarrow_array_modify(array, list(dictionary = c("a", "b")))
  expect_true(!is.null(array2$dictionary))

  array2 <- nanoarrow_array_modify(array, list(children = list("x")))
  expect_length(array2$children, 1)
})

test_that("array modify can skip validation", {
  array <- as_nanoarrow_array(1L)

  expect_error(
    nanoarrow_array_modify(array, list(children = list("x")), validate = TRUE),
    "Expected schema with 0 children"
  )

  array2 <- nanoarrow_array_modify(
    array,
    list(children = list("x")),
    validate = FALSE
  )
  expect_length(array2$children, 1)
})

test_that("[[<- works for array", {
  array <- as_nanoarrow_array(1L)
  array[["length"]] <- 0
  expect_identical(array$length, 0L)

  array <- as_nanoarrow_array(1L)
  array[[1]] <- 0
  expect_identical(array$length, 0L)

  expect_error(
    array[["not_an_item"]] <- "something",
    "Can't modify array"
  )

  expect_error(
    array[[NA_character_]] <- "something",
    "must be character"
  )

  expect_error(
    array[[character()]] <- "something",
    "must be character"
  )

  expect_error(
    array[[NA_integer_]] <- "something",
    "must be character"
  )

  expect_error(
    array[[integer()]] <- "something",
    "must be character"
  )

  expect_error(
    array[[12]] <- "something",
    "must be character"
  )
})

test_that("$<- works for array", {
  array <- as_nanoarrow_array(1L)
  array$length <- 0
  expect_identical(array$length, 0L)

  expect_error(
    array$not_an_item <- "something",
    "Can't modify array"
  )
})

test_that("<- assignment works for array$children", {
  array <- as_nanoarrow_array(
    data.frame(col1 = 1L, col2 = "a", stringsAsFactors = FALSE)
  )

  array$children$col1 <- 100
  expect_identical(
    convert_array(array),
    data.frame(col1 = 100, col2 = "a", stringsAsFactors = FALSE)
  )

  names(array$children)[1] <- "col1_new"
  expect_identical(
    convert_array(array),
    data.frame(col1_new = 100, col2 = "a", stringsAsFactors = FALSE)
  )
})

test_that("<- assignment works for array$buffers", {
  array <- as_nanoarrow_array(c(1:7, NA))
  array$null_count <- -1
  # New validity bitmap: first four elements valid, last four null
  array$buffers[[1]] <- packBits(
    c(TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE)
  )

  expect_identical(
    convert_array(array),
    c(1:4, rep(NA, 4))
  )
})

test_that("nanoarrow_array_init() creates an array", {
  array <- nanoarrow_array_init(na_int32())
  expect_identical(convert_array(array), integer())

  # Check error from init
  bad_schema <- nanoarrow_schema_modify(
    na_int32(),
    list(children = list(na_int32())),
    validate = FALSE
  )

  expect_error(
    nanoarrow_array_init(bad_schema),
    "Expected schema with 0 children"
  )
})
nanoarrow/tests/testthat/test-infer-ptype.R0000644000176200001440000000766114502402562020636 0ustar liggesusers
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# Tests for infer_nanoarrow_ptype(): the R prototype chosen for a given
# array, schema, or stream.

test_that("infer_nanoarrow_ptype() works on arrays, schemas, and streams", {
  array <- as_nanoarrow_array(logical())
  expect_identical(infer_nanoarrow_ptype(array), logical())

  schema <- infer_nanoarrow_schema(array)
  expect_identical(infer_nanoarrow_ptype(schema), logical())

  stream <- as_nanoarrow_array_stream(data.frame(x = logical()))
  expect_identical(infer_nanoarrow_ptype(stream), data.frame(x = logical()))

  expect_error(
    infer_nanoarrow_ptype("not valid"),
    "must be a nanoarrow_schema"
  )
})

test_that("infer_nanoarrow_ptype() works for basic types", {
  # vctrs is only referenced via `::` here, so guard against its absence
  skip_if_not_installed("vctrs")

  expect_identical(
    infer_nanoarrow_ptype(as_nanoarrow_array(vctrs::unspecified())),
    vctrs::unspecified()
  )

  expect_identical(
    infer_nanoarrow_ptype(as_nanoarrow_array(logical())),
    logical()
  )

  expect_identical(
    infer_nanoarrow_ptype(as_nanoarrow_array(integer())),
    integer()
  )

  expect_identical(
    infer_nanoarrow_ptype(as_nanoarrow_array(double())),
    double()
  )

  expect_identical(
    infer_nanoarrow_ptype(as_nanoarrow_schema(na_decimal128(2, 3))),
    double()
  )

  expect_identical(
    infer_nanoarrow_ptype(as_nanoarrow_array(character())),
    character()
  )

  expect_identical(
    infer_nanoarrow_ptype(
      as_nanoarrow_array(data.frame(x = character(), stringsAsFactors = FALSE))
    ),
    data.frame(x = character(), stringsAsFactors = FALSE)
  )
})

test_that("infer_nanoarrow_ptype() infers ptypes for date/time types", {
  # hms is only referenced via `::` here, so guard against its absence
  skip_if_not_installed("hms")

  array_date <- as_nanoarrow_array(as.Date("2000-01-01"))
  expect_identical(
    infer_nanoarrow_ptype(array_date),
    as.Date(character())
  )

  array_time <- as_nanoarrow_array(hms::parse_hm("12:34"))
  expect_identical(
    infer_nanoarrow_ptype(array_time),
    hms::hms()
  )

  array_duration <- as_nanoarrow_array(as.difftime(123, units = "secs"))
  expect_identical(
    infer_nanoarrow_ptype(array_duration),
    as.difftime(numeric(), units = "secs")
  )

  array_timestamp <- as_nanoarrow_array(
    as.POSIXct("2000-01-01 12:33", tz = "America/Halifax")
  )
  expect_identical(
    infer_nanoarrow_ptype(array_timestamp),
    as.POSIXct(character(), tz = "America/Halifax")
  )
})

test_that("infer_nanoarrow_ptype() infers ptypes for nested types", {
  skip_if_not_installed("arrow")
  # vctrs is only referenced via `::` here, so guard against its absence
  skip_if_not_installed("vctrs")

  array_list <- as_nanoarrow_array(vctrs::list_of(integer()))
  expect_identical(
    infer_nanoarrow_ptype(array_list),
    vctrs::list_of(.ptype = integer())
  )

  array_fixed_size <- as_nanoarrow_array(
    arrow::Array$create(
      list(1:5),
      arrow::fixed_size_list_of(arrow::int32(), 5)
    )
  )
  expect_identical(
    infer_nanoarrow_ptype(array_fixed_size),
    vctrs::list_of(.ptype = integer())
  )
})

test_that("infer_nanoarrow_ptype() errors for types it can't infer", {
  # NOTE(review): both expected messages below end right after "for " /
  # "`col` "; it looks like a trailing "<...>" type label was stripped during
  # extraction. Confirm the literals against the package's error messages.
  unsupported_array <- nanoarrow_array_init(na_decimal256(3, 4))
  expect_error(
    infer_nanoarrow_ptype(as_nanoarrow_array(unsupported_array)),
    "Can't infer R vector type for "
  )

  unsupported_struct <- nanoarrow_array_init(
    na_struct(list(col = na_decimal256(3, 4)))
  )
  expect_error(
    infer_nanoarrow_ptype(as_nanoarrow_array(unsupported_struct)),
    "Can't infer R vector type for `col` "
  )
})
nanoarrow/tests/testthat/test-pointers.R0000644000176200001440000001767114502402506020237 0ustar liggesusers
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# Tests for the low-level pointer helpers (validity checks, release, move,
# export, protected values, and address getters). Statement order matters in
# these tests: most expectations check pointer state before/after a call.

test_that("nanoarrow_pointer_is_valid() works", {
  expect_true(nanoarrow_pointer_is_valid(na_int32()))
  expect_true(nanoarrow_pointer_is_valid(as_nanoarrow_array(integer())))
  expect_true(nanoarrow_pointer_is_valid(
    as_nanoarrow_array_stream(data.frame(a = integer())))
  )

  # Freshly allocated (empty) structs are not valid
  expect_false(nanoarrow_pointer_is_valid(nanoarrow_allocate_schema()))
  expect_false(nanoarrow_pointer_is_valid(nanoarrow_allocate_array()))
  expect_false(nanoarrow_pointer_is_valid(nanoarrow_allocate_array_stream()))

  expect_error(nanoarrow_pointer_is_valid(NULL), "must inherit from")
})

test_that("nanoarrow_pointer_release() works", {
  ptr <- na_int32()
  expect_true(nanoarrow_pointer_is_valid(ptr))
  nanoarrow_pointer_release(ptr)
  expect_false(nanoarrow_pointer_is_valid(ptr))

  ptr <- as_nanoarrow_array(integer())
  expect_true(nanoarrow_pointer_is_valid(ptr))
  nanoarrow_pointer_release(ptr)
  expect_false(nanoarrow_pointer_is_valid(ptr))

  ptr <- as_nanoarrow_array_stream(data.frame(a = integer()))
  expect_true(nanoarrow_pointer_is_valid(ptr))
  nanoarrow_pointer_release(ptr)
  expect_false(nanoarrow_pointer_is_valid(ptr))

  expect_error(nanoarrow_pointer_release(NULL), "must inherit from")
})

test_that("nanoarrow_pointer_move() works for schema", {
  ptr <- na_int32()
  dst <- nanoarrow_allocate_schema()
  nanoarrow_pointer_move(ptr, dst)
  expect_false(nanoarrow_pointer_is_valid(ptr))
  expect_true(nanoarrow_schema_identical(dst, na_int32()))

  # dst is now populated, so moving into it again must fail
  expect_error(
    nanoarrow_pointer_move(ptr, dst),
    "`ptr_dst` is a valid struct ArrowSchema"
  )

  expect_error(
    nanoarrow_pointer_move(nanoarrow_allocate_schema(), ptr),
    "`ptr_src` is not a valid struct ArrowSchema"
  )
})

test_that("nanoarrow_pointer_move() works for array", {
  ptr <- as_nanoarrow_array(integer())
  dst <- nanoarrow_allocate_array()
  nanoarrow_pointer_move(ptr, dst)
  expect_false(nanoarrow_pointer_is_valid(ptr))
  expect_identical(convert_array(dst), integer())

  expect_error(
    nanoarrow_pointer_move(ptr, dst),
    "`ptr_dst` is a valid struct ArrowArray"
  )

  expect_error(
    nanoarrow_pointer_move(nanoarrow_allocate_array(), ptr),
    "`ptr_src` is not a valid struct ArrowArray"
  )
})

test_that("nanoarrow_pointer_move() works for array_stream", {
  ptr <- as_nanoarrow_array_stream(data.frame(a = integer()))
  dst <- nanoarrow_allocate_array_stream()
  nanoarrow_pointer_move(ptr, dst)
  expect_false(nanoarrow_pointer_is_valid(ptr))
  expect_true(nanoarrow_pointer_is_valid(dst))

  expect_error(
    nanoarrow_pointer_move(ptr, dst),
    "`ptr_dst` is a valid struct ArrowArrayStream"
  )

  expect_error(
    nanoarrow_pointer_move(nanoarrow_allocate_array_stream(), ptr),
    "`ptr_src` is not a valid struct ArrowArrayStream"
  )
})

test_that("nanoarrow_pointer_move() can import from chr address", {
  ptr <- na_int32()
  ptr_chr <- nanoarrow_pointer_addr_chr(ptr)
  dst <- nanoarrow_allocate_schema()

  nanoarrow_pointer_move(ptr_chr, dst)
  expect_false(nanoarrow_pointer_is_valid(ptr))
  expect_true(nanoarrow_pointer_is_valid(dst))
})

test_that("nanoarrow_pointer_move() can import from dbl address", {
  ptr <- na_int32()
  ptr_dbl <- nanoarrow_pointer_addr_dbl(ptr)
  dst <- nanoarrow_allocate_schema()

  nanoarrow_pointer_move(ptr_dbl, dst)
  expect_false(nanoarrow_pointer_is_valid(ptr))
  expect_true(nanoarrow_pointer_is_valid(dst))
})

test_that("nanoarrow_pointer_move() errors for bad input", {
  ptr <- na_int32()
  dst <- nanoarrow_allocate_schema()
  expect_error(nanoarrow_pointer_move(ptr, NULL), "`ptr_dst` must inherit from")
  expect_error(
    nanoarrow_pointer_move(NULL, dst),
    "Pointer must be chr\\[1\\], dbl\\[1\\], or external pointer"
  )
})

test_that("nanoarrow_pointer_export() works for schema", {
  ptr <- na_int32()
  dst <- nanoarrow_allocate_schema()
  nanoarrow_pointer_export(ptr, dst)
  # Unlike move, export leaves the source valid
  expect_true(nanoarrow_pointer_is_valid(ptr))
  expect_true(nanoarrow_schema_identical(dst, na_int32()))

  expect_error(
    nanoarrow_pointer_export(ptr, dst),
    "`ptr_dst` is a valid struct ArrowSchema"
  )

  expect_error(
    nanoarrow_pointer_export(nanoarrow_allocate_schema(), nanoarrow_allocate_schema()),
    "has already been released"
  )
})

test_that("nanoarrow_pointer_export() works for array", {
  ptr <- as_nanoarrow_array(integer())
  dst <- nanoarrow_allocate_array()
  nanoarrow_pointer_export(ptr, dst)
  expect_true(nanoarrow_pointer_is_valid(ptr))

  # (when exporting the schema is not included)
  nanoarrow_array_set_schema(dst, infer_nanoarrow_schema(ptr))
  expect_identical(convert_array(dst), integer())

  expect_error(
    nanoarrow_pointer_export(ptr, dst),
    "`ptr_dst` is a valid struct ArrowArray"
  )

  expect_error(
    nanoarrow_pointer_export(nanoarrow_allocate_array(), nanoarrow_allocate_array()),
    "has already been released"
  )
})

test_that("exported Arrays can have their children released", {
  ptr <- as_nanoarrow_array(data.frame(a = 1L, b = 2))
  dst <- nanoarrow_allocate_array()
  nanoarrow_pointer_export(ptr, dst)

  # Releasing any part of the exported copy must not affect the source
  expect_identical(convert_array(ptr), data.frame(a = 1L, b = 2))

  nanoarrow_pointer_release(dst$children[[1]])
  expect_identical(convert_array(ptr), data.frame(a = 1L, b = 2))

  nanoarrow_pointer_release(dst$children[[2]])
  expect_identical(convert_array(ptr), data.frame(a = 1L, b = 2))

  nanoarrow_pointer_release(dst)
  expect_identical(convert_array(ptr), data.frame(a = 1L, b = 2))
})

test_that("nanoarrow_pointer_export() works for array_stream", {
  ptr <- as_nanoarrow_array_stream(data.frame(a = integer()))
  dst <- nanoarrow_allocate_array_stream()
  nanoarrow_pointer_export(ptr, dst)
  # For streams, the source is expected to be invalid after export
  expect_false(nanoarrow_pointer_is_valid(ptr))
  expect_true(nanoarrow_pointer_is_valid(dst))
  expect_identical(convert_array_stream(dst), data.frame(a = integer()))

  expect_error(
    nanoarrow_pointer_export(ptr, dst),
    "`ptr_dst` is a valid struct ArrowArrayStream"
  )

  expect_error(
    nanoarrow_pointer_export(nanoarrow_allocate_array_stream(), ptr),
    "has already been released"
  )
})

test_that("nanoarrow_pointer_export() works for wrapped array_stream", {
  some_dependent_object <- list()
  ptr <- as_nanoarrow_array_stream(data.frame(a = integer()))
  nanoarrow_pointer_set_protected(ptr, some_dependent_object)

  dst <- nanoarrow_allocate_array_stream()
  nanoarrow_pointer_export(ptr, dst)
  expect_false(nanoarrow_pointer_is_valid(ptr))
  expect_true(nanoarrow_pointer_is_valid(dst))
  expect_identical(convert_array_stream(dst), data.frame(a = integer()))
})

test_that("nanoarrow_pointer_set_protected() errors appropriately", {
  expect_error(
    nanoarrow_pointer_set_protected(NULL),
    "must inherit from 'nanoarrow_schema', 'nanoarrow_array', or 'nanoarrow_array_stream'"
  )

  # The protected value can only be set once per pointer
  dst <- nanoarrow_allocate_array_stream()
  nanoarrow_pointer_set_protected(dst, 1234)
  expect_error(
    nanoarrow_pointer_set_protected(dst, 5678),
    "External pointer protected value has already been set"
  )
})

test_that("nanoarrow_pointer_export() errors for unknown object", {
  expect_error(nanoarrow_pointer_export(NULL), "must inherit from")
})

test_that("pointer address getters work", {
  schema <- na_int32()
  expect_match(nanoarrow_pointer_addr_chr(schema), "^[0-9]+$")
  expect_match(nanoarrow_pointer_addr_pretty(schema), "^(0x)?[0-9a-fA-F]+$")
})
nanoarrow/tests/testthat/_snaps/0000755000176200001440000000000014547061553016557 5ustar liggesusers
nanoarrow/tests/testthat/_snaps/buffer.md0000644000176200001440000000136014554501064020344 0ustar liggesusers
# buffers can be printed

    Code
      str(as_nanoarrow_buffer(1:10))
    Output
      [10][40 b]> `1 2 3 4 5 6 7 8 9 10`

---

    Code
      str(as_nanoarrow_buffer(1:10000))
    Output
      [10000][40000 b]> `1 2 3 4 5 6 7 8 9 10 11 12 1...`

---

    Code
      str(as_nanoarrow_buffer(strrep("abcdefg", 100)))
    Output
      [700 b]> `abcdefgabcdefgabcdefgabcdefgabcdefga...`

---

    Code
      str(as_nanoarrow_buffer(charToRaw(strrep("abcdefg", 100))))
    Output
      [700 b]> `61 62 63 64 65 66 67 61 62 63 64 65 ...`

---

    Code
      str(array$buffers[[2]])
    Output
      >

nanoarrow/tests/testthat/_snaps/array-stream.md0000644000176200001440000000027114554501063021501 0ustar liggesusers
# as_nanoarrow_array_stream() works for nanoarow_array_stream

    is.null(schema) is not TRUE

# as_nanoarrow_array_stream() works for nanoarow_array

    is.null(schema) is not TRUE

nanoarrow/tests/testthat/_snaps/as-array.md0000644000176200001440000000051214554501064020610 0ustar liggesusers
# as_nanoarrow_array() errors for bad logical() creation

    Invalid: Expecting a character vector

# as_nanoarrow_array() errors for bad data.frame() -> na_struct()

    Can't create Array from object of type data.frame

# as_nanoarrow_array() works for bad unspecified() create

    NotImplemented: day_time_interval

nanoarrow/tests/testthat/test-util.R0000644000176200001440000000540114502402562017337 0ustar liggesusers
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
test_that("can set option/env var to pretend the arrow package is not installed", {
  skip_if_not_installed("arrow")

  # Baseline: nothing is overridden, so arrow is reported as installed
  expect_true(arrow_installed())
  expect_silent(assert_arrow_installed("life"))

  # Override via R option
  pretend_option <- list(nanoarrow.without_arrow = TRUE)
  withr::with_options(pretend_option, {
    expect_false(arrow_installed())
    expect_error(
      assert_arrow_installed("life"),
      "Package 'arrow' required for life"
    )
  })

  # Override via environment variable
  pretend_envvar <- list(R_NANOARROW_WITHOUT_ARROW = "true")
  withr::with_envvar(pretend_envvar, {
    expect_false(arrow_installed())
  })
})

test_that("preserve/release works when release happens on another thread", {
  payload <- 1L

  # The first call drains whatever is pending; the second must find nothing
  preserved_empty()
  expect_identical(preserved_empty(), 0)

  preserve_and_release_on_other_thread(payload)

  # We can't test the exact value of preserved_count() because what the
  # garbage collector releases and when is not stable.
  expect_true(preserved_count() > 0)
  expect_identical(preserved_empty(), 1)
  expect_identical(preserved_empty(), 0)
})

test_that("vector slicer works", {
  # Plain vector input
  expect_identical(vec_slice2(letters, 1), "a")

  # data.frame input: the first row is returned as a data.frame
  first_row <- data.frame(letters = "a", stringsAsFactors = FALSE)
  expect_identical(
    vec_slice2(data.frame(letters = letters, stringsAsFactors = FALSE), 1),
    first_row
  )
})

test_that("new_data_frame() works", {
  columns <- list(x = 1, y = 2)
  expect_identical(
    new_data_frame(columns, nrow = 1),
    data.frame(x = 1, y = 2)
  )
})

test_that("vector fuzzers work", {
  template <- data.frame(
    a = logical(),
    b = integer(),
    c = double(),
    d = character(),
    stringsAsFactors = FALSE
  )

  generated <- vec_gen(template, n = 123)
  expect_identical(nrow(generated), 123L)
  # A zero-row slice of the generated frame must equal the template
  expect_identical(generated[integer(), ], template)

  expect_error(vec_gen(environment()), "Don't know how to generate vector")
})

test_that("vector shuffler works", {
  # data.frame input: same set of values after shuffling
  original_df <- data.frame(letters = letters, stringsAsFactors = FALSE)
  shuffled_df <- vec_shuffle(original_df)
  expect_setequal(shuffled_df$letters, original_df$letters)

  # Plain vector input
  shuffled_chr <- vec_shuffle(letters)
  expect_setequal(shuffled_chr, letters)
})
nanoarrow/tests/testthat/test-array-stream.R0000644000176200001440000002062614547575511021015 0ustar 
liggesusers# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

test_that("basic_array_stream() can create empty streams", {
  # With an explicit schema, a zero-batch stream is allowed
  empty <- basic_array_stream(list(), na_int32())
  expect_identical(empty$get_schema()$format, "i")
  expect_null(empty$get_next())

  # Without a schema there is no first batch to infer one from
  expect_error(
    basic_array_stream(list()),
    "Can't infer schema from first batch if there are zero batches"
  )
})

test_that("basic_array_stream() can create streams from batches", {
  first_batch <- data.frame(a = 1, b = "two", stringsAsFactors = FALSE)
  second_batch <- data.frame(a = 2, b = "three", stringsAsFactors = FALSE)
  batched <- basic_array_stream(list(first_batch, second_batch))

  expect_identical(batched$get_schema()$format, "+s")

  # Batches come back in order, followed by NULL at end of stream
  expect_identical(as.data.frame(batched$get_next()), first_batch)
  expect_identical(as.data.frame(batched$get_next()), second_batch)
  expect_null(batched$get_next())
})

test_that("basic_array_stream() can validate input or skip validation", {
  # Two batches with different types; built fresh for every call
  mismatched_batches <- function() {
    list(
      as_nanoarrow_array(1:5),
      as_nanoarrow_array(data.frame(a = 1:5))
    )
  }

  unchecked <- basic_array_stream(mismatched_batches(), validate = FALSE)
  expect_s3_class(unchecked, "nanoarrow_array_stream")

  expect_error(
    basic_array_stream(mismatched_batches(), validate = TRUE),
    "Expected array with 2 buffer"
  )
})

test_that("nanoarrow_array_stream format, print, and str methods work", {
  live_stream <- as_nanoarrow_array_stream(data.frame(x = 1:10))

  # NOTE(review): the expected string below looks truncated (angle-bracketed
  # text may have been lost when this file was extracted) -- confirm upstream.
  expect_identical(format(live_stream), ">")

  # str() and print() emit output and return their input
  expect_output(
    expect_identical(str(live_stream), live_stream),
    "nanoarrow_array_stream"
  )
  expect_output(
    expect_identical(print(live_stream), live_stream),
    "nanoarrow_array_stream"
  )
})

test_that("released nanoarrow_array_stream format, print, and str methods work", {
  blank_stream <- nanoarrow_allocate_array_stream()

  # NOTE(review): the expected string below looks truncated (angle-bracketed
  # text may have been lost when this file was extracted) -- confirm upstream.
  expect_identical(format(blank_stream), "")

  expect_output(
    expect_identical(str(blank_stream), blank_stream),
    "nanoarrow_array_stream"
  )
  expect_output(
    expect_identical(print(blank_stream), blank_stream),
    "nanoarrow_array_stream"
  )
})

test_that("as_nanoarrow_array_stream() works for nanoarow_array_stream", {
  # No schema requested: the same object is handed back
  src <- as_nanoarrow_array_stream(data.frame(x = 1:5))
  expect_identical(as_nanoarrow_array_stream(src), src)

  # Requested schema matches the stream's own schema
  src <- as_nanoarrow_array_stream(data.frame(x = 1:5))
  same_schema <- na_struct(list(x = na_int32()))
  expect_identical(
    as_nanoarrow_array_stream(src, schema = same_schema),
    src
  )

  # Requested schema differs: the error message is snapshotted
  skip_if_not_installed("arrow")
  expect_snapshot_error(
    as_nanoarrow_array_stream(src, schema = na_struct(list(x = na_double())))
  )
})

test_that("as_nanoarrow_array_stream() works for nanoarow_array", {
  array <- as_nanoarrow_array(data.frame(x = 1:5))
  expected_batches <- list(data.frame(x = 1:5))

  # Without a schema
  from_array <- as_nanoarrow_array_stream(array)
  expect_identical(infer_nanoarrow_schema(from_array)$format, "+s")
  expect_identical(
    lapply(collect_array_stream(from_array), as.data.frame),
    expected_batches
  )

  # With explicit but identical schema
  from_array <- as_nanoarrow_array_stream(
    array,
    schema = na_struct(list(x = na_int32()))
  )
  expect_identical(infer_nanoarrow_schema(from_array)$format, "+s")
  expect_identical(
    lapply(collect_array_stream(from_array), as.data.frame),
    expected_batches
  )

  # With schema requiring a cast (not implemented in arrow)
  skip_if_not_installed("arrow")
  expect_snapshot_error(
    as_nanoarrow_array_stream(array, schema = na_struct(list(x = na_double())))
  )
})

test_that("infer_nanoarrow_schema() is implemented for streams", {
  src <- as_nanoarrow_array_stream(data.frame(x = 1:5))
  inferred <- infer_nanoarrow_schema(src)
  expect_identical(inferred$children$x$format, "i")
})

test_that("as.data.frame() is implemented for streams", {
  src <- as_nanoarrow_array_stream(data.frame(x = 1:5))
  expect_identical(as.data.frame(src), data.frame(x = 1:5))

  # The stream is no longer valid after conversion
  expect_false(nanoarrow_pointer_is_valid(src))
})

test_that("as.vector() is implemented for streams", {
  src <- as_nanoarrow_array_stream(data.frame(x = 1:5))
  expect_identical(as.vector(src), data.frame(x = 1:5))

  # The stream is no longer valid after conversion
  expect_false(nanoarrow_pointer_is_valid(src))
})

test_that("nanoarrow_array_stream list interface works", {
  src <- as_nanoarrow_array_stream(data.frame(x = 1:5))

  expect_identical(length(src), 3L)
  expect_identical(names(src), c("get_schema", "get_next", "release"))

  # `[[` and `$` must give functions with the same formals
  expect_identical(formals(src[["get_schema"]]), formals(src$get_schema))
  expect_identical(formals(src[["get_next"]]), formals(src$get_next))
  expect_identical(formals(src[["release"]]), formals(src$release))

  expect_null(src[["this key does not exist"]])
})

test_that("nanoarrow_array_stream can get_schema() and get_next()", {
  src <- as_nanoarrow_array_stream(data.frame(x = 1:5))
  expect_identical(src$get_schema()$format, "+s")
  expect_identical(as.data.frame(src$get_next()), data.frame(x = 1:5))
  expect_null(src$get_next())
})

test_that("nanoarrow_array_stream can release()", {
  src <- as_nanoarrow_array_stream(data.frame(x = 1:5))
  expect_true(nanoarrow_pointer_is_valid(src))

  src$release()
  expect_false(nanoarrow_pointer_is_valid(src))
})

test_that("nanoarrow_array_stream can validate or not on get_next()", {
  # Default: a schema that does not match the batch is rejected
  src <- as_nanoarrow_array_stream(data.frame(x = 1:5))
  expect_error(
    src$get_next(schema = na_int32()),
    "Expected array with 2 buffer"
  )

  # validate = FALSE: the same call is silent
  src <- as_nanoarrow_array_stream(data.frame(x = 1:5))
  expect_silent(
    src$get_next(schema = na_int32(), validate = FALSE)
  )
})

test_that("nanoarrow_array_stream get_next() with schema = NULL", {
  src <- as_nanoarrow_array_stream(data.frame(x = 1:5))
  bare_array <- src$get_next(schema = NULL)
  expect_error(infer_nanoarrow_schema(bare_array), "has no associated schema")
})

test_that("User array stream finalizers are run on explicit release", {
  src <- basic_array_stream(list(1:5))
  src <- array_stream_set_finalizer(src, function() cat("All done!"))

  # The finalizer prints on the first release only
  expect_output(src$release(), "All done!")
  expect_silent(src$release())
})

test_that("User array stream finalizers are run on explicit release even when moved", {
  src <- basic_array_stream(list(1:5))
  src <- array_stream_set_finalizer(src, function() cat("All done!"))

  dst <- nanoarrow_allocate_array_stream()
  nanoarrow_pointer_move(src, dst)

  # The source is invalid after the move and releasing it prints nothing
  expect_false(nanoarrow_pointer_is_valid(src))
  expect_silent(nanoarrow_pointer_release(src))

  # The finalizer runs when the destination is released
  expect_output(dst$release(), "All done!")
  expect_silent(dst$release())
})

test_that("User array stream finalizers are run on explicit release even when exported", {
  src <- basic_array_stream(list(1:5))
  src <- array_stream_set_finalizer(src, function() cat("All done!"))

  dst <- nanoarrow_allocate_array_stream()
  nanoarrow_pointer_export(src, dst)

  # The source is invalid after the export and releasing it prints nothing
  expect_false(nanoarrow_pointer_is_valid(src))
  expect_silent(nanoarrow_pointer_release(src))

  # The finalizer runs when the destination is released
  expect_output(dst$release(), "All done!")
  expect_silent(dst$release())
})

test_that("Errors from user array stream finalizer are ignored", {
  src <- basic_array_stream(list(1:5))
  src <- array_stream_set_finalizer(
    src,
    function() stop("Error that will be ignored")
  )

  # Because this comes from REprintf(), it's not a message and not "output"
  # according to testthat, so we use capture.output()
  expect_identical(
    capture.output(src$release(), type = "message"),
    "Error evaluating user-supplied array stream finalizer"
  )
  expect_false(nanoarrow_pointer_is_valid(src))
  expect_silent(src$release())
})
nanoarrow/tests/testthat/test-convert-array.R0000644000176200001440000006666414547061553021212 0ustar liggesusers# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
test_that("convert_array() errors for invalid arrays", {
  # Attach a string schema to an int32 array without validating it, so the
  # mismatch is only discovered at conversion time
  array <- as_nanoarrow_array(1:10)
  nanoarrow_array_set_schema(
    array,
    na_string(),
    validate = FALSE
  )

  expect_error(
    convert_array(array),
    "Expected array with 3 buffer"
  )
})

test_that("convert_array() errors for unsupported ptype", {
  array <- as_nanoarrow_array(1:10)

  # an S3 unsupported type
  expect_error(
    convert_array(array, structure(list(), class = "some_class")),
    "Can't convert array to R vector of type some_class"
  )

  # A non-S3 unsupported type
  expect_error(
    convert_array(array, environment()),
    "Can't convert array to R vector of type environment"
  )

  # An array with a name to an unsupported type
  struct_array <- as_nanoarrow_array(data.frame(x = 1L))
  expect_error(
    convert_array(struct_array$children$x, environment()),
    "Can't convert `x`"
  )
})

test_that("convert_array() errors for unsupported array", {
  unsupported_array <- nanoarrow_array_init(na_interval_day_time())
  # NOTE(review): the pattern ends with a trailing space; angle-bracketed text
  # may have been lost when this file was extracted -- confirm upstream.
  expect_error(
    convert_array(as_nanoarrow_array(unsupported_array)),
    "Can't infer R vector type for "
  )
})

test_that("convert to vector works for data.frame", {
  df <- data.frame(a = 1L, b = "two", c = 3, d = TRUE, stringsAsFactors = FALSE)
  array <- as_nanoarrow_array(df)

  expect_identical(convert_array(array, NULL), df)
  expect_identical(convert_array(array, df), df)

  # ptype with the wrong number of columns
  expect_error(
    convert_array(array, data.frame(a = integer(), b = raw())),
    "Expected data.frame\\(\\) ptype with 4 column\\(s\\) but found 2 column\\(s\\)"
  )

  # ptype with the right number of columns but an unsupported column type
  bad_ptype <- data.frame(a = integer(), b = raw(), c = double(), d = integer())
  expect_error(
    convert_array(array, bad_ptype),
    "Can't convert `b` to R vector of type raw"
  )
})

test_that("convert to vector works for partial_frame", {
  # vctrs:: is called below, so skip when it is not installed
  skip_if_not_installed("vctrs")

  array <- as_nanoarrow_array(
    data.frame(a = 1L, b = "two", stringsAsFactors = FALSE)
  )
  expect_identical(
    convert_array(array, vctrs::partial_frame()),
    data.frame(a = 1L, b = "two", stringsAsFactors = FALSE)
  )
})

test_that("convert to vector works for extension -> data.frame()", {
  array <- nanoarrow_extension_array(
    data.frame(x = c(TRUE, FALSE, NA, FALSE, TRUE)),
    "some_ext"
  )

  expect_warning(
    expect_identical(
      convert_array(array, data.frame(x = logical())),
      data.frame(x = c(TRUE, FALSE, NA, FALSE, TRUE))
    ),
    "Converting unknown extension"
  )
})

test_that("convert to vector works for dictionary -> data.frame()", {
  array <- as_nanoarrow_array(c(0L, 1L, 2L, 1L, 0L))
  array$dictionary <- as_nanoarrow_array(data.frame(x = c(TRUE, FALSE, NA)))

  expect_identical(
    convert_array(array, data.frame(x = logical())),
    data.frame(x = c(TRUE, FALSE, NA, FALSE, TRUE))
  )
})

test_that("convert to vector works for function()", {
  # tibble:: is called below, so skip when it is not installed
  skip_if_not_installed("tibble")

  # ptype-resolving function: turns every data.frame ptype (recursively) into
  # a tibble and returns any other ptype unchanged
  tibble_or_bust <- function(array, ptype) {
    if (is.data.frame(ptype)) {
      ptype <- tibble::as_tibble(ptype)
      ptype[] <- Map(tibble_or_bust, list(NULL), ptype)
    }

    ptype
  }

  df_nested_df <- as.data.frame(
    tibble::tibble(a = 1L, b = "two", c = data.frame(a = 3))
  )
  array_nested <- as_nanoarrow_array(df_nested_df)
  expect_identical(
    convert_array(array_nested, tibble_or_bust),
    tibble::tibble(a = 1L, b = "two", c = tibble::tibble(a = 3))
  )
})

test_that("convert to vector works for tibble", {
  # tibble:: is called below, so skip when it is not installed
  skip_if_not_installed("tibble")

  array <- as_nanoarrow_array(
    data.frame(a = 1L, b = "two", stringsAsFactors = FALSE)
  )
  expect_identical(
    convert_array(array, tibble::tibble(a = integer(), b = character())),
    tibble::tibble(a = 1L, b = "two")
  )

  # Check nested tibble at both levels
  tbl_nested_df <- tibble::tibble(a = 1L, b = "two", c = data.frame(a = 3))
  array_nested <- as_nanoarrow_array(tbl_nested_df)

  expect_identical(
    convert_array(array_nested, tbl_nested_df),
    tbl_nested_df
  )

  df_nested_tbl <- as.data.frame(tbl_nested_df)
  df_nested_tbl$c <- tibble::as_tibble(df_nested_tbl$c)
  expect_identical(
    convert_array(array_nested, df_nested_tbl),
    df_nested_tbl
  )
})

test_that("convert to vector works for struct-style vectors", {
  # Without a ptype, the result is compared against a data.frame of fields
  array <- as_nanoarrow_array(as.POSIXlt("2021-01-01", tz = "America/Halifax"))
  expect_identical(
    convert_array(array),
    as.data.frame(
      unclass(as.POSIXlt("2021-01-01", tz = "America/Halifax")),
      stringsAsFactors = FALSE
    )
  )

  # With a POSIXlt ptype, the result is compared against a POSIXlt
  array <- as_nanoarrow_array(as.POSIXlt("2021-01-01", tz = "America/Halifax"))
  expect_identical(
    convert_array(array, as.POSIXlt("2021-01-01", tz = "America/Halifax")),
    as.POSIXlt("2021-01-01", tz = "America/Halifax")
  )
})

test_that("convert to vector works for unspecified()", {
  # vctrs:: is called below, so skip when it is not installed
  skip_if_not_installed("vctrs")

  array <- nanoarrow_array_init(na_na())
  array$length <- 10
  array$null_count <- 10

  # implicit for null type
  expect_identical(
    convert_array(array, to = NULL),
    vctrs::vec_cast(rep(NA, 10), vctrs::unspecified())
  )

  # explicit for null type
  expect_identical(
    convert_array(array, vctrs::unspecified()),
    vctrs::vec_cast(rep(NA, 10), vctrs::unspecified())
  )

  # explicit for non-null type that is all NAs
  array <- as_nanoarrow_array(rep(NA_integer_, 10))
  expect_identical(
    convert_array(array, vctrs::unspecified()),
    vctrs::vec_cast(rep(NA, 10), vctrs::unspecified())
  )

  # explicit for non-null type that is not all NAs
  array <- as_nanoarrow_array(c(1L, rep(NA_integer_, 9)))
  expect_warning(
    expect_identical(
      convert_array(array, vctrs::unspecified()),
      vctrs::vec_cast(rep(NA, 10), vctrs::unspecified())
    ),
    class = "nanoarrow_warning_lossy_conversion"
  )
})

test_that("convert to vector works for valid logical()", {
  skip_if_not_installed("arrow")

  arrow_numeric_types <- list(
    int8 = arrow::int8(),
    uint8 = arrow::uint8(),
    int16 = arrow::int16(),
    uint16 = arrow::uint16(),
    int32 = arrow::int32(),
    uint32 = arrow::uint32(),
    int64 = arrow::int64(),
    uint64 = arrow::uint64(),
    float32 = arrow::float32(),
    float64 = arrow::float64()
  )

  # `!!nm` is unquoted by testthat so failures are labelled with the type name
  vals <- c(NA, 0:10)
  for (nm in names(arrow_numeric_types)) {
    expect_identical(
      convert_array(
        as_nanoarrow_array(vals, schema = arrow_numeric_types[[!!nm]]),
        logical()
      ),
      vals != 0
    )
  }

  vals_no_na <- 0:10
  for (nm in names(arrow_numeric_types)) {
    expect_identical(
      convert_array(
        as_nanoarrow_array(vals_no_na, schema = arrow_numeric_types[[!!nm]]),
        logical()
      ),
      vals_no_na != 0
    )
  }

  # Boolean array to logical
  expect_identical(
    convert_array(
      as_nanoarrow_array(c(NA, TRUE, FALSE), schema = arrow::boolean()),
      logical()
    ),
    c(NA, TRUE, FALSE)
  )

  expect_identical(
    convert_array(
      as_nanoarrow_array(c(TRUE, FALSE), schema = arrow::boolean()),
      logical()
    ),
    c(TRUE, FALSE)
  )
})

test_that("convert to vector works for null -> logical()", {
  array <- nanoarrow_array_init(na_na())
  array$length <- 10
  array$null_count <- 10

  expect_identical(
    convert_array(array, logical()),
    rep(NA, 10)
  )
})

test_that("convert to vector works for extension -> logical()", {
  array <- nanoarrow_extension_array(c(TRUE, FALSE, NA), "some_ext")
  expect_warning(
    expect_identical(
      convert_array(array, logical()),
      c(TRUE, FALSE, NA)
    ),
    "Converting unknown extension"
  )
})

test_that("convert to vector works for dictionary -> logical()", {
  array <- as_nanoarrow_array(c(0L, 1L, 2L, 1L, 0L))
  array$dictionary <- as_nanoarrow_array(c(TRUE, FALSE, NA))

  expect_identical(
    convert_array(array, logical()),
    c(TRUE, FALSE, NA, FALSE, TRUE)
  )
})

test_that("convert to vector errors for bad array to logical()", {
  expect_error(
    convert_array(as_nanoarrow_array(letters), logical()),
    "Can't convert array to R vector of type logical"
  )
})

test_that("convert to vector works for valid integer()", {
  skip_if_not_installed("arrow")

  arrow_int_types <- list(
    int8 = arrow::int8(),
    uint8 = arrow::uint8(),
    int16 = arrow::int16(),
    uint16 = arrow::uint16(),
    int32 = arrow::int32(),
    uint32 = arrow::uint32(),
    int64 = arrow::int64(),
    uint64 = arrow::uint64(),
    float32 = arrow::float32(),
    float64 = arrow::float64()
  )

  ints <- c(NA, 0:10)
  for (nm in names(arrow_int_types)) {
    expect_identical(
      convert_array(
        as_nanoarrow_array(ints, schema = arrow_int_types[[!!nm]]),
        integer()
      ),
      ints
    )
  }

  ints_no_na <- 0:10
  for (nm in names(arrow_int_types)) {
    expect_identical(
      convert_array(
        as_nanoarrow_array(ints_no_na, schema = arrow_int_types[[!!nm]]),
        integer()
      ),
      ints_no_na
    )
  }

  # Boolean array to integer
  expect_identical(
    convert_array(
      as_nanoarrow_array(c(NA, TRUE, FALSE), schema = arrow::boolean()),
      integer()
    ),
    c(NA, 1L, 0L)
  )

  expect_identical(
    convert_array(
      as_nanoarrow_array(c(TRUE, FALSE), schema = arrow::boolean()),
      integer()
    ),
    c(1L, 0L)
  )
})

test_that("convert to works for integer() -> character()", {
  skip_if_not_installed("arrow")

  arrow_int_types <- list(
    int8 = arrow::int8(),
    uint8 = arrow::uint8(),
    int16 = arrow::int16(),
    uint16 = arrow::uint16(),
    int32 = arrow::int32(),
    uint32 = arrow::uint32(),
    int64 = arrow::int64()
  )

  ints <- c(NA, 0:10)
  for (nm in names(arrow_int_types)) {
    expect_identical(
      convert_array(
        as_nanoarrow_array(ints, schema = arrow_int_types[[!!nm]]),
        character()
      ),
      as.character(ints)
    )
  }
})

# This test converts to integer(); it was previously titled "null -> logical()",
# which duplicated the title of the logical() test above.
test_that("convert to vector works for null -> integer()", {
  array <- nanoarrow_array_init(na_na())
  array$length <- 10
  array$null_count <- 10

  expect_identical(
    convert_array(array, integer()),
    rep(NA_integer_, 10)
  )
})

test_that("convert to vector works for extension -> integer()", {
  array <- nanoarrow_extension_array(c(0L, 1L, NA_integer_), "some_ext")
  expect_warning(
    expect_identical(
      convert_array(array, integer()),
      c(0L, 1L, NA_integer_)
    ),
    "Converting unknown extension"
  )
})

test_that("convert to vector warns for invalid integer()", {
  # A value one past the integer range is expected to become NA with a warning
  array <- as_nanoarrow_array(.Machine$integer.max + 1)
  expect_warning(
    expect_identical(convert_array(array, integer()), NA_integer_),
    class = "nanoarrow_warning_lossy_conversion"
  )

  array <- as_nanoarrow_array(c(NA, .Machine$integer.max + 1))
  expect_warning(
    expect_identical(convert_array(array, integer()), c(NA_integer_, NA_integer_)),
    class = "nanoarrow_warning_lossy_conversion"
  )
})

test_that("convert to vector errors for bad array to integer()", {
  expect_error(
    convert_array(as_nanoarrow_array(letters), integer()),
    "Can't convert array to R vector of type integer"
  )
})

test_that("convert to vector works for valid double()", {
  skip_if_not_installed("arrow")

  arrow_numeric_types <- list(
    int8 = arrow::int8(),
    uint8 = arrow::uint8(),
    int16 = arrow::int16(),
    uint16 = arrow::uint16(),
    int32 = arrow::int32(),
    uint32 = arrow::uint32(),
    int64 = arrow::int64(),
    uint64 = arrow::uint64(),
    float32 = arrow::float32(),
    float64 = arrow::float64()
  )

  vals <- as.double(c(NA, 0:10))
  for (nm in names(arrow_numeric_types)) {
    expect_identical(
      convert_array(
        as_nanoarrow_array(vals, schema = arrow_numeric_types[[!!nm]]),
        double()
      ),
      vals
    )
  }

  vals_no_na <- as.double(0:10)
  for (nm in names(arrow_numeric_types)) {
    expect_identical(
      convert_array(
        as_nanoarrow_array(vals_no_na, schema = arrow_numeric_types[[!!nm]]),
        double()
      ),
      vals_no_na
    )
  }

  # Boolean array to double
  expect_identical(
    convert_array(
      as_nanoarrow_array(c(NA, TRUE, FALSE), schema = arrow::boolean()),
      double()
    ),
    as.double(c(NA, 1L, 0L))
  )

  expect_identical(
    convert_array(
      as_nanoarrow_array(c(TRUE, FALSE), schema = arrow::boolean()),
      double()
    ),
    as.double(c(1L, 0L))
  )
})

test_that("convert to vector works for decimal128 -> double()", {
  skip_if_not_installed("arrow")

  array <- as_nanoarrow_array(arrow::Array$create(1:10)$cast(arrow::decimal128(20, 10)))

  # Check via S3 dispatch
  expect_equal(
    convert_array(array, double()),
    as.double(1:10)
  )

  # ...and via C -> S3 dispatch
  expect_equal(
    convert_array.default(array, double()),
    as.double(1:10)
  )
})

test_that("convert to vector works for null -> double()", {
  array <- nanoarrow_array_init(na_na())
  array$length <- 10
  array$null_count <- 10

  expect_identical(
    convert_array(array, double()),
    rep(NA_real_, 10)
  )
})

test_that("convert to vector works for extension -> double()", {
  array <- nanoarrow_extension_array(c(0, 1, NA_real_), "some_ext")
  expect_warning(
    expect_identical(
      convert_array(array, double()),
      c(0, 1, NA_real_)
    ),
    "Converting unknown extension"
  )
})

test_that("convert to vector works for dictionary -> double()", {
  array <- as_nanoarrow_array(c(0L, 1L, 2L, 1L, 0L))
  array$dictionary <- as_nanoarrow_array(c(123, 0, NA_real_))

  expect_identical(
    convert_array(array, double()),
    c(123, 0, NA_real_, 0, 123)
  )
})

test_that("convert to vector warns for possibly invalid double()", {
  array <- as_nanoarrow_array(2^54, schema = na_int64())
  expect_warning(
    convert_array(array, double()),
    class = "nanoarrow_warning_lossy_conversion"
  )
})

test_that("convert to vector errors for bad array to double()", {
  expect_error(
    convert_array(as_nanoarrow_array(letters), double()),
    "Can't convert array to R vector of type numeric"
  )
})

test_that("convert to vector works for valid integer64()", {
  skip_if_not_installed("bit64")
  skip_if_not_installed("arrow")

  arrow_numeric_types <- list(
    int8 = arrow::int8(),
    uint8 = arrow::uint8(),
    int16 = arrow::int16(),
    uint16 = arrow::uint16(),
    int32 = arrow::int32(),
    uint32 = arrow::uint32(),
    int64 = arrow::int64(),
    uint64 = arrow::uint64(),
    float32 = arrow::float32(),
    float64 = arrow::float64()
  )

  vals <- bit64::as.integer64(c(NA, 0:10))
  for (nm in names(arrow_numeric_types)) {
    expect_identical(
      convert_array(
        as_nanoarrow_array(vals, schema = arrow_numeric_types[[!!nm]]),
        bit64::integer64()
      ),
      vals
    )
  }

  vals_no_na <- bit64::as.integer64(0:10)
  for (nm in names(arrow_numeric_types)) {
    expect_identical(
      convert_array(
        as_nanoarrow_array(vals_no_na, schema = arrow_numeric_types[[!!nm]]),
        bit64::integer64()
      ),
      vals_no_na
    )
  }

  # Boolean array to integer64
  expect_identical(
    convert_array(
      as_nanoarrow_array(c(NA, TRUE, FALSE), schema = arrow::boolean()),
      bit64::integer64()
    ),
    bit64::as.integer64(c(NA, 1L, 0L))
  )

  expect_identical(
    convert_array(
      as_nanoarrow_array(c(TRUE, FALSE), schema = arrow::boolean()),
      bit64::integer64()
    ),
    bit64::as.integer64(c(1L, 0L))
  )
})

test_that("convert to vector works for null -> integer64()", {
  skip_if_not_installed("bit64")

  array <- nanoarrow_array_init(na_na())
  array$length <- 10
  array$null_count <- 10

  expect_identical(
    convert_array(array, bit64::integer64()),
    rep(bit64::NA_integer64_, 10)
  )
})

test_that("convert to vector works for extension -> integer64()", {
  skip_if_not_installed("bit64")

  vec <- bit64::as.integer64(c(0, 1, NA))
  array <- nanoarrow_extension_array(vec, "some_ext")
  expect_warning(
    expect_identical(
      convert_array(array, bit64::integer64()),
      vec
    ),
    "Converting unknown extension"
  )
})

test_that("convert to vector errors for bad array to integer64()", {
  skip_if_not_installed("bit64")

  expect_error(
    convert_array(as_nanoarrow_array(letters), bit64::integer64()),
    "Can't convert array to R vector of type integer64"
  )
})

test_that("convert to vector works for character()", {
  array <- as_nanoarrow_array(letters)
  expect_identical(
    convert_array(array, character()),
    letters
  )

  # make sure we get altrep here
  expect_true(is_nanoarrow_altrep(convert_array(array, character())))

  # check an array that we can't convert
  expect_error(
    convert_array(as_nanoarrow_array(1:5), list()),
    "Can't convert array to R vector of type list"
  )
})

test_that("convert to vector works for null -> character()", {
  array <- nanoarrow_array_init(na_na())
  array$length <- 10
  array$null_count <- 10

  all_nulls <- convert_array(array, character())
  nanoarrow_altrep_force_materialize(all_nulls)
  expect_identical(
    all_nulls,
    rep(NA_character_, 10)
  )
})

test_that("convert to vector works for extension -> character()", {
  array <- nanoarrow_extension_array(c("a", "b", NA_character_), "some_ext")
  expect_warning(
    expect_identical(
      convert_array(array, character()),
      c("a", "b", NA_character_)
    ),
    "Converting unknown extension"
  )
})

test_that("convert to vector works for dictionary -> character()", {
  array <- as_nanoarrow_array(factor(letters[5:1]))

  # Via S3 dispatch
  expect_identical(
    convert_array(array, character()),
    c("e", "d", "c", "b", "a")
  )

  # Via C -> S3 dispatch
  expect_identical(
    convert_array.default(array, character()),
    c("e", "d", "c", "b", "a")
  )
})

test_that("convert to vector works for dictionary -> factor()", {
  array <- as_nanoarrow_array(factor(letters[5:1]))

  # With empty levels
  expect_identical(
    convert_array(array, factor()),
    factor(letters[5:1])
  )

  # With identical levels
  expect_identical(
    convert_array(array, factor(levels = c("a", "b", "c", "d", "e"))),
    factor(letters[5:1])
  )

  # With mismatched levels
  expect_identical(
    convert_array(array, factor(levels = c("b", "a", "c", "e", "d"))),
    factor(letters[5:1], levels = c("b", "a", "c", "e", "d"))
  )

  expect_error(
    convert_array(array, factor(levels = letters[-4])),
    "some levels in data do not exist"
  )
})

test_that("batched convert to vector works for dictionary -> factor()", {
  # A slightly different path: convert_array.factor() called from C multiple
  # times with different dictionaries each time.
  array1 <- as_nanoarrow_array(factor(letters[1:5]))
  array2 <- as_nanoarrow_array(factor(letters[6:10]))
  array3 <- as_nanoarrow_array(factor(letters[11:15]))

  stream <- basic_array_stream(list(array1, array2, array3))
  expect_identical(
    convert_array_stream(stream, factor(levels = letters)),
    factor(letters[1:15], levels = letters)
  )
})

test_that("batched convert to vector errors for dictionary -> factor()", {
  # We can't currently handle a preallocate + fill style conversion where the
  # result is partial_factor().
  array1 <- as_nanoarrow_array(factor(letters[1:5]))
  array2 <- as_nanoarrow_array(factor(letters[6:10]))
  array3 <- as_nanoarrow_array(factor(letters[11:15]))

  stream <- basic_array_stream(list(array1, array2, array3))
  expect_error(
    convert_array_stream(stream, factor()),
    "Can't allocate ptype of class 'factor'"
  )
})

test_that("convert to vector works for blob::blob()", {
  skip_if_not_installed("blob")

  array <- as_nanoarrow_array(list(as.raw(1:5)), schema = na_binary())
  expect_identical(
    convert_array(array),
    blob::blob(as.raw(1:5))
  )

  expect_identical(
    convert_array(array, blob::blob()),
    blob::blob(as.raw(1:5))
  )
})

test_that("convert to vector works for null -> blob::blob()", {
  # blob:: is called below, so skip when it is not installed
  skip_if_not_installed("blob")

  array <- nanoarrow_array_init(na_na())
  array$length <- 10
  array$null_count <- 10

  expect_identical(
    convert_array(array, blob::blob()),
    blob::new_blob(rep(list(NULL), 10))
  )
})

test_that("convert to vector works for list -> vctrs::list_of", {
  skip_if_not_installed("arrow")
  skip_if_not_installed("vctrs")

  array_list <- as_nanoarrow_array(
    arrow::Array$create(
      list(1:5, 6:10, NULL),
      type = arrow::list_of(arrow::int32())
    )
  )

  # Default conversion
  expect_identical(
    convert_array(array_list),
    vctrs::list_of(1:5, 6:10, NULL, .ptype = integer())
  )

  # With explicit ptype
  expect_identical(
    convert_array(array_list, vctrs::list_of(.ptype = double())),
    vctrs::list_of(as.double(1:5), as.double(6:10), NULL, .ptype = double())
  )

  # With bad ptype
  expect_error(
    convert_array(array_list, vctrs::list_of(.ptype = list())),
    "Can't convert `item`"
  )

  # With malformed ptype
  ptype <- vctrs::list_of(.ptype = character())
  attr(ptype, "ptype") <- NULL
  expect_error(
    convert_array(array_list, ptype),
    "Expected attribute 'ptype'"
  )
})

test_that("convert to vector works for large_list -> vctrs::list_of", {
  skip_if_not_installed("arrow")
  skip_if_not_installed("vctrs")

  array_list <- as_nanoarrow_array(
    arrow::Array$create(
      list(1:5, 6:10, NULL),
      type = arrow::large_list_of(arrow::int32())
    )
  )

  # Default conversion
  expect_identical(
    convert_array(array_list),
    vctrs::list_of(1:5, 6:10, NULL, .ptype = integer())
  )

  # With explicit ptype
  expect_identical(
    convert_array(array_list, vctrs::list_of(.ptype = double())),
    vctrs::list_of(as.double(1:5), as.double(6:10), NULL, .ptype = double())
  )

  # With bad ptype
  expect_error(
    convert_array(array_list, vctrs::list_of(.ptype = list())),
    "Can't convert `item`"
  )
})

test_that("convert to vector works for fixed_size_list -> vctrs::list_of", {
  skip_if_not_installed("arrow")
  skip_if_not_installed("vctrs")

  array_list <- as_nanoarrow_array(
    arrow::Array$create(
      list(1:5, 6:10, NULL),
      type = arrow::fixed_size_list_of(arrow::int32(), 5)
    )
  )

  # Default conversion
  expect_identical(
    convert_array(array_list),
    vctrs::list_of(1:5, 6:10, NULL, .ptype = integer())
  )

  # With explicit ptype
  expect_identical(
    convert_array(array_list, vctrs::list_of(.ptype = double())),
    vctrs::list_of(as.double(1:5), as.double(6:10), NULL, .ptype = double())
  )

  # With bad ptype
  expect_error(
    convert_array(array_list, vctrs::list_of(.ptype = list())),
    "Can't convert `item`"
  )
})

test_that("convert to vector works for null -> vctrs::list_of()", {
  # vctrs:: is called below, so skip when it is not installed
  skip_if_not_installed("vctrs")

  array <- nanoarrow_array_init(na_na())
  array$length <- 10
  array$null_count <- 10

  expect_identical(
    convert_array(array, vctrs::list_of(.ptype = integer())),
    vctrs::new_list_of(rep(list(NULL), 10), ptype = integer())
  )
})

test_that("convert to vector works for Date", {
  array_date <- as_nanoarrow_array(as.Date(c(NA, "2000-01-01")))
  expect_identical(
    convert_array(array_date),
    as.Date(c(NA, "2000-01-01"))
  )

  # With a date64 schema the expected result is a POSIXct in UTC
  array_date <- as_nanoarrow_array(
    as.Date(c(NA, "2000-01-01")),
    schema = na_date64()
  )
  expect_identical(
    convert_array(array_date),
    as.POSIXct(c(NA, "2000-01-01"), tz = "UTC")
  )
})

test_that("convert to vector works for null -> Date", {
  array <- nanoarrow_array_init(na_na())
  array$length <- 10
  array$null_count <- 10

  expect_identical(
    convert_array(array, as.Date(character())),
    as.Date(rep(NA_character_, 10))
  )
})

test_that("convert to vector works for hms", {
  # hms:: is called below, so skip when it is not installed
  skip_if_not_installed("hms")

  array_time <- as_nanoarrow_array(hms::parse_hm("12:34"))
  expect_identical(
    convert_array(array_time),
    hms::parse_hm("12:34")
  )
})

test_that("convert to vector works for null -> hms", {
  # hms:: is called below, so skip when it is not installed
  skip_if_not_installed("hms")

  array <- nanoarrow_array_init(na_na())
  array$length <- 10
  array$null_count <- 10

  expect_identical(
    convert_array(array, hms::hms()),
    hms::parse_hms(rep(NA_character_, 10))
  )
})

test_that("convert to vector works for POSIXct", {
  array_timestamp <- as_nanoarrow_array(
    as.POSIXct("2000-01-01 12:33", tz = "America/Halifax")
  )

  expect_identical(
    convert_array(array_timestamp),
    as.POSIXct("2000-01-01 12:33", tz = "America/Halifax")
  )
})

test_that("convert to vector works for null -> POSIXct", {
  array <- nanoarrow_array_init(na_na())
  array$length <- 10
  array$null_count <- 10

  expect_identical(
    convert_array(array, as.POSIXct(character(), tz = "America/Halifax")),
    as.POSIXct(rep(NA_character_, 10), tz = "America/Halifax")
  )
})

test_that("convert to vector works for difftime", {
  x <- as.difftime(123, units = "secs")
  array_duration <- as_nanoarrow_array(x)

  # default
  expect_identical(convert_array(array_duration), x)

  # explicit
  expect_identical(convert_array(array_duration, x), x)

  # explicit with other difftime units
  units(x) <- "mins"
  expect_identical(convert_array(array_duration, x), x)

  units(x) <- "hours"
  expect_identical(convert_array(array_duration, x), x)

  units(x) <- "days"
  expect_identical(convert_array(array_duration, x), x)

  # expect_equal() (tolerance) rather than expect_identical() for weeks
  units(x) <- "weeks"
  expect_equal(convert_array(array_duration, x), x)

  # with all Arrow units
  x <- as.difftime(123, units = "secs")
  array_duration <- as_nanoarrow_array(x, na_duration("s"))
  expect_identical(convert_array(array_duration), x)
  array_duration <- as_nanoarrow_array(x, na_duration("ms"))
  expect_identical(convert_array(array_duration), x)
  array_duration <- as_nanoarrow_array(x, na_duration("us"))
  expect_identical(convert_array(array_duration), x)
  array_duration <- as_nanoarrow_array(x, na_duration("ns"))
  expect_equal(convert_array(array_duration), x)

  # bad ptype values
  attr(x, "units") <- NULL
  expect_error(
    convert_array(array_duration, x),
    "Expected difftime 'units' attribute of type"
  )

  attr(x, "units") <- character()
  expect_error(
    convert_array(array_duration, x),
    "Expected difftime 'units' attribute of type"
  )

  attr(x, "units") <- integer(1)
  expect_error(
    convert_array(array_duration, x),
    "Expected difftime 'units' attribute of type"
  )

  attr(x, "units") <- "gazornenplat"
  expect_error(
    convert_array(array_duration, x),
    "Unexpected value for difftime 'units' attribute"
  )

  attr(x, "units") <- NA_character_
  expect_error(
    convert_array(array_duration, x),
    "Unexpected value for difftime 'units' attribute"
  )
})

test_that("convert to vector works for null -> difftime", {
  array <- nanoarrow_array_init(na_na())
  array$length <- 10
  array$null_count <- 10

  expect_identical(
    convert_array(array, as.difftime(numeric(), units = "secs")),
    as.difftime(rep(NA_real_, 10), units = "secs")
  )
})

test_that("convert to vector works for data frames nested inside lists", {
  skip_if_not_installed("arrow")
  skip_if_not_installed("vctrs")

  df_in_list <- vctrs::list_of(
    data.frame(x = 1:5),
    data.frame(x = 6:10),
    data.frame(x = 11:15)
  )

  nested_array <- as_nanoarrow_array(df_in_list)
  expect_identical(
    convert_array(nested_array),
    df_in_list
  )
})

test_that("convert to vector works for lists nested in data frames", {
  skip_if_not_installed("arrow")
  skip_if_not_installed("vctrs")

  df_in_list_in_df <- data.frame(
    x = vctrs::list_of(
      data.frame(x = 1:5),
      data.frame(x = 6:10),
      data.frame(x = 11:15)
    )
  )

  nested_array <- as_nanoarrow_array(df_in_list_in_df)
  expect_identical(
    convert_array(nested_array),
    df_in_list_in_df
  )
})
nanoarrow/tests/testthat/test-convert-array-stream.R0000644000176200001440000001307014502402562022450 0ustar liggesusers# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Tests for convert_array_stream(). A fresh stream is created before every
# conversion (presumably because converting consumes the stream -- confirm).

test_that("convert array stream works", {
  # Zero batches with an explicit struct schema
  stream0 <- basic_array_stream(list(), schema = na_struct(list(x = na_int32())))
  expect_identical(convert_array_stream(stream0), data.frame(x = integer()))

  # One batch
  stream1 <- basic_array_stream(list(data.frame(x = 1:5)))
  expect_identical(convert_array_stream(stream1), data.frame(x = 1:5))

  # Two batches are concatenated
  stream2 <- basic_array_stream(
    list(
      data.frame(x = 1:5),
      data.frame(x = 6:10)
    )
  )
  expect_identical(convert_array_stream(stream2), data.frame(x = 1:10))

  # Zero batches with a non-struct schema
  stream3 <- basic_array_stream(list(), schema = na_int32())
  expect_identical(convert_array_stream(stream3), integer())
})

test_that("convert array stream with explicit size works", {
  stream0 <- basic_array_stream(list(), schema = na_struct(list(x = na_int32())))
  expect_identical(
    convert_array_stream(stream0, size = 0),
    data.frame(x = integer())
  )

  stream1 <- basic_array_stream(list(data.frame(x = 1:5)))
  expect_identical(
    convert_array_stream(stream1, size = 5),
    data.frame(x = 1:5)
  )

  stream2 <- basic_array_stream(
    list(
      data.frame(x = 1:5),
      data.frame(x = 6:10)
    )
  )
  expect_identical(
    convert_array_stream(stream2, size = 10),
    data.frame(x = 1:10)
  )
})

test_that("convert array stream with functional ptype works", {
  # tibble is only needed by this test; skip instead of erroring without it
  skip_if_not_installed("tibble")

  # Recursively turns every data.frame ptype (including nested ones) into a
  # tibble; the `array` argument is unused.
  tibble_or_bust <- function(array, ptype) {
    if (is.data.frame(ptype)) {
      ptype <- tibble::as_tibble(ptype)
      ptype[] <- Map(tibble_or_bust, list(NULL), ptype)
    }

    ptype
  }

  df_nested_df <- as.data.frame(
    tibble::tibble(a = 1L, b = "two", c = data.frame(a = 3))
  )
  stream_nested <- as_nanoarrow_array_stream(df_nested_df)
  expect_identical(
    convert_array_stream(stream_nested, tibble_or_bust),
    tibble::tibble(a = 1L, b = "two", c = tibble::tibble(a = 3))
  )
})

test_that("convert array stream works for nested data.frames", {
  # tibble is only needed by this test; skip instead of erroring without it
  skip_if_not_installed("tibble")

  tbl_nested_df <- tibble::tibble(a = 1L, b = "two", c = data.frame(a = 3))

  # Explicit ptype
  stream_nested <- as_nanoarrow_array_stream(tbl_nested_df)
  expect_identical(
    convert_array_stream(stream_nested, tbl_nested_df),
    tbl_nested_df
  )

  # Explicit size, default ptype
  stream_nested <- as_nanoarrow_array_stream(tbl_nested_df)
  expect_identical(
    convert_array_stream(stream_nested, size = 1L),
    as.data.frame(tbl_nested_df)
  )

  # Explicit ptype and size
  stream_nested <- as_nanoarrow_array_stream(tbl_nested_df)
  expect_identical(
    convert_array_stream(stream_nested, tbl_nested_df, size = 1L),
    tbl_nested_df
  )
})

test_that("convert array stream works for struct-style vectors", {
  # The unclassed POSIXlt is a plain data.frame of its component fields
  raw_posixlt <- as.data.frame(
    unclass(as.POSIXlt("2021-01-01", tz = "America/Halifax")),
    stringsAsFactors = FALSE
  )

  stream <- as_nanoarrow_array_stream(raw_posixlt)
  expect_identical(
    convert_array_stream(stream),
    raw_posixlt
  )

  stream <- as_nanoarrow_array_stream(raw_posixlt)
  expect_identical(
    convert_array_stream(stream, as.POSIXlt("2021-01-01", tz = "America/Halifax")),
    as.POSIXlt("2021-01-01", tz = "America/Halifax")
  )

  # Check with fixed size since this takes a different code path
  stream <- as_nanoarrow_array_stream(raw_posixlt)
  expect_identical(
    convert_array_stream(stream, size = 1L),
    raw_posixlt
  )

  stream <- as_nanoarrow_array_stream(raw_posixlt)
  expect_identical(
    convert_array_stream(
      stream,
      as.POSIXlt("2021-01-01", tz = "America/Halifax"),
      size = 1
    ),
    as.POSIXlt("2021-01-01", tz = "America/Halifax")
  )
})

test_that("convert array stream respects the value of n", {
  batches <- list(
    data.frame(x = 1:5),
    data.frame(x = 6:10),
    data.frame(x = 11:15)
  )

  stream3 <- basic_array_stream(batches)
  expect_identical(
    convert_array_stream(stream3, n = 0),
    data.frame(x = integer())
  )

  stream3 <- basic_array_stream(batches)
  expect_identical(
    convert_array_stream(stream3, n = 1),
    data.frame(x = 1:5)
  )

  stream3 <- basic_array_stream(batches)
  expect_identical(
    convert_array_stream(stream3, n = 2),
    data.frame(x = 1:10)
  )
})

test_that("fixed-size convert array stream respects the value of n", {
  batches <- list(
    data.frame(x = 1:5),
    data.frame(x = 6:10),
    data.frame(x = 11:15)
  )

  stream3 <- basic_array_stream(batches)
  expect_identical(
    convert_array_stream(stream3, n = 0, size = 0),
    data.frame(x = integer())
  )

  stream3 <- basic_array_stream(batches)
  expect_identical(
    convert_array_stream(stream3, n = 1, size = 5),
    data.frame(x = 1:5)
  )

  stream3 <- basic_array_stream(batches)
  expect_identical(
    convert_array_stream(stream3, n = 2, size = 10),
    data.frame(x = 1:10)
  )
})

test_that("fixed-size stream conversion errors when the output has insufficient size", {
  stream <- as_nanoarrow_array_stream(data.frame(x = 1:100))
  expect_error(
    convert_array_stream(stream, size = 2),
    "Expected to materialize 100 values in batch 1 but materialized 2"
  )
})
nanoarrow/tests/testthat/test-type.R0000644000176200001440000001305314502402562017345 0ustar liggesusers# Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License.
# Tests for the na_*() schema constructors.

test_that("type constructors for parameter-free types work", {
  # Some of these types have parameters but also have default values
  parameter_free_types <- c(
    "na", "bool", "uint8", "int8", "uint16", "int16",
    "uint32", "int32", "uint64", "int64", "half_float", "float",
    "double", "string", "binary", "date32",
    "date64", "timestamp", "time32", "time64", "interval_months",
    "interval_day_time", "struct",
    "duration", "large_string", "large_binary",
    "interval_month_day_nano"
  )

  for (type_name in parameter_free_types) {
    # Check that the right type gets created
    expect_identical(
      nanoarrow_schema_parse(na_type(!!type_name))$type,
      !!type_name
    )

    # Check that the default schema is nullable ("struct" is the one type
    # this test expects to have flags of 0 by default)
    if (type_name == "struct") {
      expect_identical(na_type(!!type_name)$flags, 0L)
    } else {
      expect_identical(na_type(!!type_name)$flags, 2L)
    }

    # Check that non-nullable schemas are non-nullable
    expect_identical(na_type(!!type_name, nullable = FALSE)$flags, 0L)
  }
})

test_that("non-logical nullable values do not crash", {
  # nullable = NULL is expected to yield flags of 0 rather than an error
  expect_identical(na_na(nullable = NULL)$flags, 0L)
  expect_identical(na_time32(nullable = NULL)$flags, 0L)
  expect_identical(na_fixed_size_binary(1, nullable = NULL)$flags, 0L)
  expect_identical(na_decimal128(1, 1, nullable = NULL)$flags, 0L)
})

test_that("timestamp type passes along timezone parameter", {
  schema <- na_timestamp(timezone = "UTC")
  expect_identical(nanoarrow_schema_parse(schema)$timezone, "UTC")

  # NULL, NA, and zero-length timezones are all rejected
  expect_error(
    na_timestamp(timezone = NULL),
    "must be character"
  )

  expect_error(
    na_timestamp(timezone = NA_character_),
    "must be character"
  )

  expect_error(
    na_timestamp(timezone = character()),
    "must be character"
  )
})

test_that("decimal types pass along precision and scale", {
  schema <- na_decimal128(12, 10)
  expect_identical(nanoarrow_schema_parse(schema)$decimal_precision, 12L)
  expect_identical(nanoarrow_schema_parse(schema)$decimal_scale, 10L)

  schema <- na_decimal256(12, 10)
  expect_identical(nanoarrow_schema_parse(schema)$decimal_precision, 12L)
  expect_identical(nanoarrow_schema_parse(schema)$decimal_scale, 10L)
})

test_that("fixed-size binary passes along fixed-size parameter", {
  schema <- na_fixed_size_binary(123)
  expect_identical(nanoarrow_schema_parse(schema)$fixed_size, 123L)
})

# This test used to appear twice, byte for byte; one copy is kept.
test_that("struct constructor passes along children", {
  schema <- na_struct(list(col_name = na_int32()))
  expect_identical(schema$format, "+s")
  expect_named(schema$children, "col_name")
  expect_identical(schema$children[[1]]$format, "i")
})

test_that("sparse and dense unions can be created", {
  schema <- na_sparse_union(list(na_int32(), na_string()))
  expect_identical(nanoarrow_schema_parse(schema)$union_type_ids, c(0L, 1L))

  schema <- na_dense_union(list(na_int32(), na_string()))
  expect_identical(nanoarrow_schema_parse(schema)$union_type_ids, c(0L, 1L))
})

test_that("list constructors assign the correct child type", {
  schema <- na_list(na_int32())
  expect_identical(schema$format, "+l")
  expect_named(schema$children, "item")
  expect_identical(schema$children[[1]]$format, "i")

  schema <- na_large_list(na_int32())
  expect_identical(schema$format, "+L")
  expect_named(schema$children, "item")
  expect_identical(schema$children[[1]]$format, "i")

  schema <- na_fixed_size_list(na_int32(), 123)
  expect_identical(schema$format, "+w:123")
  expect_named(schema$children, "item")
  expect_identical(schema$children[[1]]$format, "i")
})

test_that("map constructor assigns the correct key and value types", {
  schema <- na_map(na_int32(nullable = FALSE), na_int64())
  expect_named(schema$children, "entries")
  expect_named(schema$children$entries$children, c("key", "value"))
  expect_identical(schema$children$entries$children$key$format, "i")
  expect_identical(schema$children$entries$children$value$format, "l")
})

test_that("dictionary types can be created", {
  # The top-level format is the index type; the value type is in $dictionary
  schema <- na_dictionary(na_string(), ordered = FALSE)
  expect_identical(schema$format, "i")
  expect_identical(schema$dictionary$format, "u")
  expect_identical(schema$flags, ARROW_FLAG$NULLABLE)

  schema <- na_dictionary(na_string(), ordered = TRUE)
  expect_identical(
    schema$flags,
    bitwOr(ARROW_FLAG$NULLABLE, ARROW_FLAG$DICTIONARY_ORDERED)
  )
})

test_that("extension types can be created", {
  schema <- na_extension(na_int32(), "ext_name", "ext_meta")
  expect_identical(nanoarrow_schema_parse(schema)$extension_name, "ext_name")
  expect_identical(
    nanoarrow_schema_parse(schema)$extension_metadata,
    charToRaw("ext_meta")
  )
})
nanoarrow/tests/testthat/test-as-array.R0000644000176200001440000005425714547061553020130 0ustar liggesusers# Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License.
# Tests for as_nanoarrow_array() on logical, integer, double, integer64 and
# character input. In the assertions below buffers[[1]] is compared against a
# packed validity bitmap (empty when there are no nulls) and buffers[[2]]
# against the expected data or offsets.

test_that("as_nanoarrow_array() works for nanoarrow_array", {
  array <- as_nanoarrow_array(1:10)
  expect_identical(as_nanoarrow_array(array), array)

  array <- as_nanoarrow_array(1:10, schema = na_int32())
  expect_identical(as_nanoarrow_array(array), array)

  # Only the int32 -> int64 cast below is gated on arrow
  skip_if_not_installed("arrow")

  casted <- as_nanoarrow_array(array, schema = na_int64())
  expect_identical(infer_nanoarrow_schema(casted)$format, "l")
  expect_identical(convert_array(casted), as.double(1:10))
})

test_that("as_nanoarrow_array() works for logical() -> na_bool()", {
  # Without nulls
  array <- as_nanoarrow_array(c(TRUE, FALSE, TRUE, FALSE), schema = na_bool())
  expect_identical(infer_nanoarrow_schema(array)$format, "b")
  expect_identical(as.raw(array$buffers[[1]]), raw())
  expect_identical(array$offset, 0L)
  expect_identical(array$null_count, 0L)
  # packBits() already returns raw, so no as.raw() wrapper is needed
  expect_identical(
    as.raw(array$buffers[[2]]),
    packBits(c(TRUE, FALSE, TRUE, FALSE, rep(FALSE, 4)))
  )

  # With nulls
  array <- as_nanoarrow_array(c(TRUE, FALSE, NA), schema = na_bool())
  expect_identical(infer_nanoarrow_schema(array)$format, "b")
  expect_identical(array$null_count, 1L)
  expect_identical(
    as.raw(array$buffers[[1]]),
    packBits(c(rep(TRUE, 2), FALSE, rep(FALSE, 5)))
  )
  expect_identical(
    as.raw(array$buffers[[2]]),
    packBits(c(TRUE, FALSE, FALSE, rep(FALSE, 5)))
  )
})

test_that("as_nanoarrow_array() errors for bad logical() creation", {
  skip_if_not_installed("arrow")

  expect_snapshot_error(
    as_nanoarrow_array(TRUE, schema = na_string())
  )
})

test_that("as_nanoarrow_array() works for logical() -> na_int32()", {
  # Without nulls
  array <- as_nanoarrow_array(c(TRUE, FALSE, TRUE, FALSE), schema = na_int32())
  expect_identical(infer_nanoarrow_schema(array)$format, "i")
  expect_identical(as.raw(array$buffers[[1]]), raw())
  expect_identical(array$offset, 0L)
  expect_identical(array$null_count, 0L)
  expect_identical(
    as.raw(array$buffers[[2]]),
    as.raw(as_nanoarrow_buffer(c(TRUE, FALSE, TRUE, FALSE)))
  )

  # With nulls
  array <- as_nanoarrow_array(c(TRUE, FALSE, NA), schema = na_int32())
  expect_identical(infer_nanoarrow_schema(array)$format, "i")
  expect_identical(array$null_count, 1L)
  expect_identical(
    as.raw(array$buffers[[1]]),
    packBits(c(rep(TRUE, 2), FALSE, rep(FALSE, 5)))
  )
  expect_identical(
    as.raw(array$buffers[[2]]),
    as.raw(as_nanoarrow_buffer(c(TRUE, FALSE, NA)))
  )
})

test_that("as_nanoarrow_array() works for integer() -> na_int32()", {
  # Without nulls
  array <- as_nanoarrow_array(1:10)
  expect_identical(infer_nanoarrow_schema(array)$format, "i")
  expect_identical(as.raw(array$buffers[[1]]), raw())
  expect_identical(array$offset, 0L)
  expect_identical(array$null_count, 0L)
  expect_identical(as.raw(array$buffers[[2]]), as.raw(as_nanoarrow_buffer(1:10)))

  # With nulls
  array <- as_nanoarrow_array(c(1:10, NA))
  expect_identical(infer_nanoarrow_schema(array)$format, "i")
  expect_identical(array$null_count, 1L)
  expect_identical(
    as.raw(array$buffers[[1]]),
    packBits(c(rep(TRUE, 10), FALSE, rep(FALSE, 5)))
  )
  expect_identical(
    as.raw(array$buffers[[2]]),
    as.raw(as_nanoarrow_buffer(c(1:10, NA)))
  )
})

test_that("as_nanoarrow_array() works for integer -> na_int64()", {
  skip_if_not_installed("arrow")

  casted <- as_nanoarrow_array(1:10, schema = na_int64())
  expect_identical(infer_nanoarrow_schema(casted)$format, "l")
  expect_identical(convert_array(casted), as.double(1:10))
})

test_that("as_nanoarrow_array() works for double() -> na_double()", {
  # Without nulls
  array <- as_nanoarrow_array(as.double(1:10))
  expect_identical(infer_nanoarrow_schema(array)$format, "g")
  expect_identical(as.raw(array$buffers[[1]]), raw())
  expect_identical(array$offset, 0L)
  expect_identical(array$null_count, 0L)
  expect_identical(
    as.raw(array$buffers[[2]]),
    as.raw(as_nanoarrow_buffer(as.double(1:10)))
  )

  # With nulls
  array <- as_nanoarrow_array(c(1:10, NA_real_))
  expect_identical(infer_nanoarrow_schema(array)$format, "g")
  expect_identical(array$null_count, 1L)
  expect_identical(
    as.raw(array$buffers[[1]]),
    packBits(c(rep(TRUE, 10), FALSE, rep(FALSE, 5)))
  )
  expect_identical(
    as.raw(array$buffers[[2]]),
    as.raw(as_nanoarrow_buffer(c(1:10, NA_real_)))
  )
})

test_that("as_nanoarrow_array() works for double() -> na_int32()", {
  # Without nulls
  array <- as_nanoarrow_array(as.double(1:10), schema = na_int32())
  expect_identical(infer_nanoarrow_schema(array)$format, "i")
  expect_identical(as.raw(array$buffers[[1]]), raw())
  expect_identical(array$offset, 0L)
  expect_identical(array$null_count, 0L)
  expect_identical(
    as.raw(array$buffers[[2]]),
    as.raw(as_nanoarrow_buffer(1:10))
  )

  # With nulls
  array <- as_nanoarrow_array(c(1:10, NA_real_), schema = na_int32())
  expect_identical(infer_nanoarrow_schema(array)$format, "i")
  expect_identical(array$null_count, 1L)
  expect_identical(
    as.raw(array$buffers[[1]]),
    packBits(c(rep(TRUE, 10), FALSE, rep(FALSE, 5)))
  )
  # The last element here is 0 because (int)nan is undefined behaviour
  expect_identical(
    as.raw(array$buffers[[2]]),
    as.raw(as_nanoarrow_buffer(c(1:10, 0L)))
  )

  # With overflow
  expect_warning(
    as_nanoarrow_array(.Machine$integer.max + as.double(1:5), schema = na_int32()),
    class = "nanoarrow_warning_lossy_conversion"
  )
})

test_that("as_nanoarrow_array() works for double() -> na_int64()", {
  # Without nulls
  array <- as_nanoarrow_array(as.double(1:10), schema = na_int64())
  expect_identical(infer_nanoarrow_schema(array)$format, "l")
  expect_identical(as.raw(array$buffers[[1]]), raw())
  expect_identical(array$offset, 0L)
  expect_identical(array$null_count, 0L)
  # This *is* how we create int64 buffers, so just check the roundtrip
  expect_identical(convert_array(array), as.double(1:10))

  # With nulls
  array <- as_nanoarrow_array(c(1:10, NA_real_), schema = na_int64())
  expect_identical(infer_nanoarrow_schema(array)$format, "l")
  expect_identical(array$null_count, 1L)
  expect_identical(
    as.raw(array$buffers[[1]]),
    packBits(c(rep(TRUE, 10), FALSE, rep(FALSE, 5)))
  )
  expect_identical(convert_array(array), as.double(c(1:10, NA_real_)))
})

test_that("as_nanoarrow_array() works for integer64() -> na_int32()", {
  skip_if_not_installed("bit64")

  # Without nulls
  array <- as_nanoarrow_array(bit64::as.integer64(1:10), schema = na_int32())
  expect_identical(infer_nanoarrow_schema(array)$format, "i")
  expect_identical(as.raw(array$buffers[[1]]), raw())
  expect_identical(array$offset, 0L)
  expect_identical(array$null_count, 0L)
  expect_identical(
    as.raw(array$buffers[[2]]),
    as.raw(as_nanoarrow_buffer(1:10))
  )

  # With nulls
  array <- as_nanoarrow_array(
    bit64::as.integer64(c(1:10, NA_real_)),
    schema = na_int32()
  )
  expect_identical(infer_nanoarrow_schema(array)$format, "i")
  expect_identical(array$null_count, 1L)
  expect_identical(
    as.raw(array$buffers[[1]]),
    packBits(c(rep(TRUE, 10), FALSE, rep(FALSE, 5)))
  )
  # The last element here is 0 because (int)nan is undefined behaviour
  # NOTE(review): this comment matches the double() -> na_int32() test word
  # for word; the input here is integer64, so confirm the rationale applies.
  expect_identical(
    as.raw(array$buffers[[2]]),
    as.raw(as_nanoarrow_buffer(c(1:10, 0L)))
  )
})

test_that("as_nanoarrow_array() works for integer64() -> na_int64()", {
  skip_if_not_installed("bit64")

  # Default roundtrip
  array <- as_nanoarrow_array(bit64::as.integer64(1:10))
  expect_identical(convert_array(array, double()), as.double(1:10))

  # Without nulls
  array <- as_nanoarrow_array(bit64::as.integer64(1:10), schema = na_int64())
  expect_identical(infer_nanoarrow_schema(array)$format, "l")
  expect_identical(as.raw(array$buffers[[1]]), raw())
  expect_identical(array$offset, 0L)
  expect_identical(array$null_count, 0L)
  # This *is* how we create int64 buffers, so just check the roundtrip
  expect_identical(convert_array(array, double()), as.double(1:10))

  # With nulls
  array <- as_nanoarrow_array(
    bit64::as.integer64(c(1:10, NA_real_)),
    schema = na_int64()
  )
  expect_identical(infer_nanoarrow_schema(array)$format, "l")
  expect_identical(array$null_count, 1L)
  expect_identical(
    as.raw(array$buffers[[1]]),
    packBits(c(rep(TRUE, 10), FALSE, rep(FALSE, 5)))
  )
  expect_identical(convert_array(array, double()), as.double(c(1:10, NA_real_)))
})

test_that("as_nanoarrow_array() works for double -> na_int8()", {
  skip_if_not_installed("arrow")

  casted <- as_nanoarrow_array(as.double(1:10), schema = na_int8())
  expect_identical(infer_nanoarrow_schema(casted)$format, "c")
  expect_identical(convert_array(casted), 1:10)
})

test_that("as_nanoarrow_array() works for character() -> na_string()", {
  # Without nulls
  array <- as_nanoarrow_array(letters)
  expect_identical(infer_nanoarrow_schema(array)$format, "u")
  expect_identical(as.raw(array$buffers[[1]]), raw())
  expect_identical(array$offset, 0L)
  expect_identical(array$null_count, 0L)
  # 26 one-character strings give offsets 0 through 26
  expect_identical(
    as.raw(array$buffers[[2]]),
    as.raw(as_nanoarrow_buffer(0:26))
  )
  expect_identical(
    as.raw(array$buffers[[3]]),
    as.raw(as_nanoarrow_buffer(paste(letters, collapse = "")))
  )

  # With nulls
  array <- as_nanoarrow_array(c(letters, NA))
  expect_identical(infer_nanoarrow_schema(array)$format, "u")
  expect_identical(array$null_count, 1L)
  expect_identical(
    as.raw(array$buffers[[1]]),
    packBits(c(rep(TRUE, 26), FALSE, rep(FALSE, 5)))
  )
  # The trailing NA adds no bytes, so the last offset repeats
  expect_identical(
    as.raw(array$buffers[[2]]),
    as.raw(as_nanoarrow_buffer(c(0:26, 26L)))
  )
  expect_identical(
    as.raw(array$buffers[[3]]),
    as.raw(as_nanoarrow_buffer(paste(letters, collapse = "")))
  )
})

test_that("as_nanoarrow_array() works for character() -> na_large_string()", {
  skip_if_not_installed("arrow")

  # Without nulls
  array <- as_nanoarrow_array(letters, schema = na_large_string())
  expect_identical(infer_nanoarrow_schema(array)$format, "U")
  expect_identical(as.raw(array$buffers[[1]]), raw())
  expect_identical(array$offset, 0L)
  expect_identical(array$null_count, 0L)
  expect_identical(
    as.raw(array$buffers[[3]]),
    as.raw(as_nanoarrow_buffer(paste(letters, collapse = "")))
  )

  # With nulls
  array <- as_nanoarrow_array(c(letters, NA), schema = na_large_string())
  expect_identical(infer_nanoarrow_schema(array)$format, "U")
  expect_identical(array$null_count, 1L)
  expect_identical(
    as.raw(array$buffers[[1]]),
    packBits(c(rep(TRUE, 26), FALSE, rep(FALSE, 5)))
  )
  expect_identical(
    as.raw(array$buffers[[3]]),
    as.raw(as_nanoarrow_buffer(paste(letters, collapse = "")))
  )
})
# Tests for as_nanoarrow_array() on factor, data.frame, date/time, blob and
# list input.

test_that("as_nanoarrow_array() works for factor() -> na_dictionary()", {
  array <- as_nanoarrow_array(
    factor(letters),
    schema = na_dictionary(na_string(), na_int32())
  )

  expect_identical(infer_nanoarrow_schema(array)$format, "i")
  expect_identical(infer_nanoarrow_schema(array$dictionary)$format, "u")

  expect_identical(as.raw(array$buffers[[1]]), raw())
  # Indices are zero-based
  expect_identical(
    as.raw(array$buffers[[2]]),
    as.raw(as_nanoarrow_buffer(0:25))
  )

  expect_identical(
    as.raw(array$dictionary$buffers[[3]]),
    charToRaw(paste0(letters, collapse = ""))
  )
})

test_that("as_nanoarrow_array() works for factor() -> na_string()", {
  array <- as_nanoarrow_array(
    factor(letters),
    schema = na_string()
  )

  expect_identical(infer_nanoarrow_schema(array)$format, "u")
  expect_null(array$dictionary)

  expect_identical(as.raw(array$buffers[[1]]), raw())
  expect_identical(
    as.raw(array$buffers[[2]]),
    as.raw(as_nanoarrow_buffer(0:26))
  )

  expect_identical(
    as.raw(array$buffers[[3]]),
    charToRaw(paste0(letters, collapse = ""))
  )
})

test_that("as_nanoarrow_array() works for data.frame() -> na_struct()", {
  array <- as_nanoarrow_array(data.frame(x = 1:10))
  expect_identical(array$length, 10L)
  expect_identical(array$offset, 0L)
  expect_identical(array$null_count, 0L)
  expect_identical(infer_nanoarrow_schema(array)$format, "+s")
  expect_identical(infer_nanoarrow_schema(array$children$x)$format, "i")
  expect_identical(
    as.raw(array$children$x$buffers[[2]]),
    as.raw(as_nanoarrow_buffer(1:10))
  )
})

test_that("as_nanoarrow_array() errors for bad data.frame() -> na_struct()", {
  expect_error(
    as_nanoarrow_array(data.frame(x = 1:10), schema = na_struct()),
    "Expected 1 schema children"
  )

  # Only the snapshot check below is gated on arrow
  skip_if_not_installed("arrow")

  expect_snapshot_error(
    as_nanoarrow_array(data.frame(x = 1:10), schema = na_int32())
  )
})

test_that("as_nanoarrow_array() works for Date -> na_date32()", {
  array <- as_nanoarrow_array(as.Date(c("2000-01-01", "2023-02-03", NA)))
  expect_identical(infer_nanoarrow_schema(array)$format, "tdD")
  expect_identical(array$length, 3L)
  expect_identical(array$null_count, 1L)
  # 0x03: the two lowest bits are set (first two elements valid)
  expect_identical(as.raw(array$buffers[[1]]), as.raw(0x03))
  expect_identical(
    as.raw(array$buffers[[2]]),
    as.raw(as_nanoarrow_buffer(c(10957L, 19391L, NA)))
  )
})

test_that("as_nanoarrow_array() works for Date -> na_date64()", {
  array <- as_nanoarrow_array(
    as.Date(c("2000-01-01", "2023-02-03", NA)),
    schema = na_date64()
  )
  expect_identical(infer_nanoarrow_schema(array)$format, "tdm")
  expect_identical(array$length, 3L)
  expect_identical(array$null_count, 1L)
  expect_identical(as.raw(array$buffers[[1]]), as.raw(0x03))
  # Expected storage: day counts scaled by 86400000 (ms per day) as int64
  storage <- as_nanoarrow_array(
    c(10957L, 19391L, NA) * 86400000,
    schema = na_int64()
  )
  expect_identical(
    as.raw(array$buffers[[2]]),
    as.raw(storage$buffers[[2]])
  )
})

test_that("as_nanoarrow_array() works for POSIXct -> na_timestamp()", {
  array <- as_nanoarrow_array(
    as.POSIXct(c("2000-01-01", "2023-02-03", NA), tz = "UTC"),
    schema = na_timestamp("ms", timezone = "UTC")
  )
  expect_identical(infer_nanoarrow_schema(array)$format, "tsm:UTC")
  expect_identical(array$length, 3L)
  expect_identical(array$null_count, 1L)
  expect_identical(as.raw(array$buffers[[1]]), as.raw(0x03))
  # Same expected storage as the date64 test: day counts scaled by 86400000
  storage <- as_nanoarrow_array(
    c(10957L, 19391L, NA) * 86400000,
    schema = na_int64()
  )
  expect_identical(
    as.raw(array$buffers[[2]]),
    as.raw(storage$buffers[[2]])
  )
})

test_that("as_nanoarrow_array() works for difftime -> na_duration()", {
  array <- as_nanoarrow_array(
    as.difftime(c(1:5, NA), units = "secs"),
    schema = na_duration("ms")
  )
  expect_identical(infer_nanoarrow_schema(array)$format, "tDm")
  expect_identical(array$length, 6L)
  expect_identical(array$null_count, 1L)
  expect_identical(
    as.raw(array$buffers[[1]]),
    packBits(c(rep(TRUE, 5), FALSE, rep(FALSE, 2)))
  )
  # Expected storage: seconds scaled by 1000 as int64
  storage <- as_nanoarrow_array(
    c(1:5, NA) * 1000,
    schema = na_int64()
  )
  expect_identical(
    as.raw(array$buffers[[2]]),
    as.raw(storage$buffers[[2]])
  )
})

test_that("as_nanoarrow_array() works for blob::blob() -> na_binary()", {
  skip_if_not_installed("blob")

  # Without nulls
  array <- as_nanoarrow_array(blob::as_blob(letters))
  expect_identical(infer_nanoarrow_schema(array)$format, "z")
  expect_identical(as.raw(array$buffers[[1]]), raw())
  expect_identical(array$offset, 0L)
  expect_identical(array$null_count, 0L)
  expect_identical(
    as.raw(array$buffers[[2]]),
    as.raw(as_nanoarrow_buffer(0:26))
  )
  expect_identical(
    as.raw(array$buffers[[3]]),
    as.raw(as_nanoarrow_buffer(paste(letters, collapse = "")))
  )

  # With nulls
  array <- as_nanoarrow_array(blob::as_blob(c(letters, NA)))
  expect_identical(infer_nanoarrow_schema(array)$format, "z")
  expect_identical(array$null_count, 1L)
  expect_identical(
    as.raw(array$buffers[[1]]),
    packBits(c(rep(TRUE, 26), FALSE, rep(FALSE, 5)))
  )
  expect_identical(
    as.raw(array$buffers[[2]]),
    as.raw(as_nanoarrow_buffer(c(0:26, 26L)))
  )
  expect_identical(
    as.raw(array$buffers[[3]]),
    as.raw(as_nanoarrow_buffer(paste(letters, collapse = "")))
  )
})

test_that("as_nanoarrow_array() works for blob::blob() -> na_large_binary()", {
  # The input is built with blob::as_blob(), so blob is required as well as
  # arrow; without this guard the test errors instead of skipping.
  skip_if_not_installed("blob")
  skip_if_not_installed("arrow")

  # Without nulls
  array <- as_nanoarrow_array(blob::as_blob(letters), schema = na_large_binary())
  expect_identical(infer_nanoarrow_schema(array)$format, "Z")
  expect_identical(as.raw(array$buffers[[1]]), raw())
  expect_identical(array$offset, 0L)
  expect_identical(array$null_count, 0L)
  expect_identical(
    as.raw(array$buffers[[3]]),
    as.raw(as_nanoarrow_buffer(paste(letters, collapse = "")))
  )

  # With nulls
  array <- as_nanoarrow_array(
    blob::as_blob(c(letters, NA)),
    schema = na_large_binary()
  )
  expect_identical(infer_nanoarrow_schema(array)$format, "Z")
  expect_identical(array$null_count, 1L)
  expect_identical(
    as.raw(array$buffers[[1]]),
    packBits(c(rep(TRUE, 26), FALSE, rep(FALSE, 5)))
  )
  expect_identical(
    as.raw(array$buffers[[3]]),
    as.raw(as_nanoarrow_buffer(paste(letters, collapse = "")))
  )
})

test_that("as_nanoarrow_array() works for list(raw()) -> na_binary()", {
  # Without nulls
  array <- as_nanoarrow_array(lapply(letters, charToRaw))
  expect_identical(infer_nanoarrow_schema(array)$format, "z")
  expect_identical(as.raw(array$buffers[[1]]), raw())
  expect_identical(array$offset, 0L)
  expect_identical(array$null_count, 0L)
  expect_identical(
    as.raw(array$buffers[[2]]),
    as.raw(as_nanoarrow_buffer(0:26))
  )
  expect_identical(
    as.raw(array$buffers[[3]]),
    as.raw(as_nanoarrow_buffer(paste(letters, collapse = "")))
  )

  # With nulls
  array <- as_nanoarrow_array(c(lapply(letters, charToRaw), list(NULL)))
  expect_identical(infer_nanoarrow_schema(array)$format, "z")
  expect_identical(array$null_count, 1L)
  expect_identical(
    as.raw(array$buffers[[1]]),
    packBits(c(rep(TRUE, 26), FALSE, rep(FALSE, 5)))
  )
  expect_identical(
    as.raw(array$buffers[[2]]),
    as.raw(as_nanoarrow_buffer(c(0:26, 26L)))
  )
  expect_identical(
    as.raw(array$buffers[[3]]),
    as.raw(as_nanoarrow_buffer(paste(letters, collapse = "")))
  )
})

test_that("as_nanoarrow_array() works for list(NULL) -> na_list(na_na())", {
  array <- as_nanoarrow_array(list(NULL))
  expect_identical(infer_nanoarrow_schema(array)$format, "+l")
  expect_identical(array$length, 1L)
  expect_identical(array$null_count, 1L)
  # The expected validity bitmap is borrowed from a bool array's data buffer
  expect_identical(
    as.raw(array$buffers[[1]]),
    as.raw(as_nanoarrow_array(FALSE)$buffers[[2]])
  )
  expect_identical(
    as.raw(array$buffers[[2]]),
    as.raw(as_nanoarrow_buffer(c(0L, 0L)))
  )

  expect_identical(infer_nanoarrow_schema(array$children[[1]])$format, "n")
  expect_identical(array$children[[1]]$length, 0L)
})

test_that("as_nanoarrow_array() works for list(integer()) -> na_list(na_int32())", {
  array <- as_nanoarrow_array(list(1:5, 6:10), schema = na_list(na_int32()))
  expect_identical(infer_nanoarrow_schema(array)$format, "+l")
  expect_identical(array$length, 2L)
  expect_identical(array$null_count, 0L)
  # The expected validity bitmap is borrowed from a bool array's data buffer
  expect_identical(
    as.raw(array$buffers[[1]]),
    as.raw(as_nanoarrow_array(c(TRUE, TRUE))$buffers[[2]])
  )
  expect_identical(
    as.raw(array$buffers[[2]]),
    as.raw(as_nanoarrow_buffer(c(0L, 5L, 10L)))
  )

  expect_identical(infer_nanoarrow_schema(array$children[[1]])$format, "i")
  expect_identical(array$children[[1]]$length, 10L)
})
# vctrs::unspecified(5) is expected to become a null-type array ("n") in
# which every element counts as null.
test_that("as_nanoarrow_array() works for unspecified() -> na_na()", {
  skip_if_not_installed("vctrs")

  array <- as_nanoarrow_array(vctrs::unspecified(5))
  expect_identical(infer_nanoarrow_schema(array)$format, "n")
  expect_identical(array$length, 5L)
  expect_identical(array$null_count, 5L)
})

test_that("as_nanoarrow_array() works for bad unspecified() create", {
  skip_if_not_installed("vctrs")
  skip_if_not_installed("arrow")
  expect_snapshot_error(
    as_nanoarrow_array(vctrs::unspecified(5), schema = na_interval_day_time())
  )
})

test_that("as_nanoarrow_array() can convert data.frame() to sparse_union()", {
  # Features: At least one element with more than one non-NA value,
  # one element with all NA values.
  test_df <- data.frame(
    lgl = c(TRUE, NA, NA, NA, NA, FALSE),
    int = c(NA, 123L, NA, NA, NA, NA),
    dbl = c(NA, NA, 456, NA, NA, NA),
    chr = c(NA, NA, NA, "789", NA, NA),
    stringsAsFactors = FALSE
  )

  array <- as_nanoarrow_array(
    test_df,
    schema = na_sparse_union(lapply(test_df, infer_nanoarrow_schema))
  )

  expect_identical(infer_nanoarrow_schema(array)$format, "+us:0,1,2,3")
  expect_identical(array$length, 6L)
  expect_identical(array$null_count, 0L)

  expect_identical(
    as.raw(array$buffers[[1]]),
    as.raw(as_nanoarrow_buffer(as.raw(c(0L, 1L, 2L, 3L, 0L, 0L))))
  )

  expect_identical(
    lapply(array$children, convert_array),
    lapply(test_df, identity)
  )
  expect_identical(convert_array(array), test_df)
})

# Same input as the sparse union test above, but converted with
# na_dense_union(): children only hold the values selected for them and a
# second buffer of per-element child offsets is checked.
test_that("as_nanoarrow_array() can convert data.frame() to dense_union()", {
  test_df <- data.frame(
    lgl = c(TRUE, NA, NA, NA, NA, FALSE),
    int = c(NA, 123L, NA, NA, NA, NA),
    dbl = c(NA, NA, 456, NA, NA, NA),
    chr = c(NA, NA, NA, "789", NA, NA),
    stringsAsFactors = FALSE
  )

  array <- as_nanoarrow_array(
    test_df,
    schema = na_dense_union(lapply(test_df, infer_nanoarrow_schema))
  )

  expect_identical(infer_nanoarrow_schema(array)$format, "+ud:0,1,2,3")
  expect_identical(array$length, 6L)
  expect_identical(array$null_count, 0L)

  expect_identical(
    as.raw(array$buffers[[1]]),
    as.raw(as_nanoarrow_buffer(as.raw(c(0L, 1L, 2L, 3L, 0L, 0L))))
  )
  expect_identical(
    as.raw(array$buffers[[2]]),
    as.raw(as_nanoarrow_buffer(c(0L, 0L, 0L, 0L, 1L, 2L)))
  )

  expect_identical(
    lapply(array$children, convert_array),
    list(
      lgl = c(TRUE, NA, FALSE),
      int = 123L,
      dbl = 456,
      chr = "789"
    )
  )
  expect_identical(convert_array(array), test_df)
})

test_that("as_nanoarrow_array() for union type errors for unsupported objects", {
  expect_error(
    as_nanoarrow_array(data.frame(), schema = na_dense_union()),
    "Can't convert data frame with 0 columns"
  )
})
nanoarrow/tests/testthat.R0000644000176200001440000000224214307221533015405 0ustar liggesusers
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

library(testthat)
library(nanoarrow)

# Verbose reporting is opted into by setting either environment variable
# to "true" (case-insensitive).
verbose_test_output <- identical(tolower(Sys.getenv("ARROW_R_DEV", "false")), "true") ||
  identical(tolower(Sys.getenv("ARROW_R_VERBOSE_TEST", "false")), "true")

if (verbose_test_output) {
  reporter <- MultiReporter$new(list(CheckReporter$new(), LocationReporter$new()))
} else {
  reporter <- check_reporter()
}

test_check("nanoarrow", reporter = reporter)
nanoarrow/src/0000755000176200001440000000000014556775567013102 5ustar liggesusers
nanoarrow/src/pointers.c0000644000176200001440000002333614547575511015103 0ustar liggesusers
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
#define R_NO_REMAP #include #include #include "array.h" #include "array_stream.h" #include "schema.h" // More reliable way to stringify intptr_t on Windows using C++ void intptr_as_string(intptr_t ptr_int, char* buf); SEXP nanoarrow_c_allocate_schema(void) { return nanoarrow_schema_owning_xptr(); } SEXP nanoarrow_c_allocate_array(void) { return nanoarrow_array_owning_xptr(); } SEXP nanoarrow_c_allocate_array_stream(void) { return nanoarow_array_stream_owning_xptr(); } SEXP nanoarrow_c_pointer(SEXP obj_sexp) { if (TYPEOF(obj_sexp) == EXTPTRSXP) { return obj_sexp; } else if (TYPEOF(obj_sexp) == REALSXP && Rf_length(obj_sexp) == 1) { // Note that this is not a good idea to actually do; however, is provided for // backward compatibility with early versions of the arrow R package. intptr_t ptr_int = (intptr_t)(REAL(obj_sexp)[0]); return R_MakeExternalPtr((void*)ptr_int, R_NilValue, R_NilValue); } else if (TYPEOF(obj_sexp) == STRSXP && Rf_length(obj_sexp) == 1) { const char* text = CHAR(STRING_ELT(obj_sexp, 0)); char* end_ptr; intptr_t ptr_int = strtoll(text, &end_ptr, 10); if (end_ptr != (text + strlen(text))) { Rf_error("'%s' could not be interpreted as an unsigned 64-bit integer", text); } return R_MakeExternalPtr((void*)ptr_int, R_NilValue, R_NilValue); } Rf_error("Pointer must be chr[1], dbl[1], or external pointer"); return R_NilValue; } SEXP nanoarrow_c_pointer_addr_dbl(SEXP ptr) { // Note that this is not a good idea to actually do; however, is provided for // backward compatibility with early versions of the arrow R package. 
uintptr_t ptr_int = (uintptr_t)R_ExternalPtrAddr(nanoarrow_c_pointer(ptr)); return Rf_ScalarReal((double)ptr_int); } SEXP nanoarrow_c_pointer_addr_chr(SEXP ptr) { intptr_t ptr_int = (intptr_t)R_ExternalPtrAddr(nanoarrow_c_pointer(ptr)); char addr_chars[100]; memset(addr_chars, 0, 100); intptr_as_string(ptr_int, addr_chars); return Rf_mkString(addr_chars); } SEXP nanoarrow_c_pointer_addr_pretty(SEXP ptr) { char addr_chars[100]; memset(addr_chars, 0, 100); snprintf(addr_chars, sizeof(addr_chars), "%p", R_ExternalPtrAddr(nanoarrow_c_pointer(ptr))); return Rf_mkString(addr_chars); } SEXP nanoarrow_c_pointer_is_valid(SEXP ptr) { if (Rf_inherits(ptr, "nanoarrow_schema")) { struct ArrowSchema* obj = (struct ArrowSchema*)R_ExternalPtrAddr(ptr); return Rf_ScalarLogical(obj != NULL && obj->release != NULL); } else if (Rf_inherits(ptr, "nanoarrow_array")) { struct ArrowArray* obj = (struct ArrowArray*)R_ExternalPtrAddr(ptr); return Rf_ScalarLogical(obj != NULL && obj->release != NULL); } else if (Rf_inherits(ptr, "nanoarrow_array_stream")) { struct ArrowArrayStream* obj = (struct ArrowArrayStream*)R_ExternalPtrAddr(ptr); return Rf_ScalarLogical(obj != NULL && obj->release != NULL); } else { Rf_error( "`ptr` must inherit from 'nanoarrow_schema', 'nanoarrow_array', or " "'nanoarrow_array_stream'"); } return R_NilValue; } SEXP nanoarrow_c_pointer_release(SEXP ptr) { if (Rf_inherits(ptr, "nanoarrow_schema")) { struct ArrowSchema* obj = (struct ArrowSchema*)R_ExternalPtrAddr(ptr); if (obj != NULL && obj->release != NULL) { obj->release(obj); obj->release = NULL; } } else if (Rf_inherits(ptr, "nanoarrow_array")) { struct ArrowArray* obj = (struct ArrowArray*)R_ExternalPtrAddr(ptr); if (obj != NULL && obj->release != NULL) { obj->release(obj); obj->release = NULL; } } else if (Rf_inherits(ptr, "nanoarrow_array_stream")) { struct ArrowArrayStream* obj = (struct ArrowArrayStream*)R_ExternalPtrAddr(ptr); if (obj != NULL && obj->release != NULL) { obj->release(obj); obj->release = NULL; 
} } else { Rf_error( "`ptr` must inherit from 'nanoarrow_schema', 'nanoarrow_array', or " "'nanoarrow_array_stream'"); } return R_NilValue; } SEXP nanoarrow_c_pointer_move(SEXP ptr_src, SEXP ptr_dst) { SEXP xptr_src = PROTECT(nanoarrow_c_pointer(ptr_src)); if (Rf_inherits(ptr_dst, "nanoarrow_schema")) { struct ArrowSchema* obj_dst = (struct ArrowSchema*)R_ExternalPtrAddr(ptr_dst); if (obj_dst == NULL) { Rf_error("`ptr_dst` is a pointer to NULL"); } if (obj_dst->release != NULL) { Rf_error("`ptr_dst` is a valid struct ArrowSchema"); } struct ArrowSchema* obj_src = (struct ArrowSchema*)R_ExternalPtrAddr(xptr_src); if (obj_src == NULL || obj_src->release == NULL) { Rf_error("`ptr_src` is not a valid struct ArrowSchema"); } ArrowSchemaMove(obj_src, obj_dst); } else if (Rf_inherits(ptr_dst, "nanoarrow_array")) { struct ArrowArray* obj_dst = (struct ArrowArray*)R_ExternalPtrAddr(ptr_dst); if (obj_dst == NULL) { Rf_error("`ptr_dst` is a pointer to NULL"); } if (obj_dst->release != NULL) { Rf_error("`ptr_dst` is a valid struct ArrowArray"); } struct ArrowArray* obj_src = (struct ArrowArray*)R_ExternalPtrAddr(xptr_src); if (obj_src == NULL || obj_src->release == NULL) { Rf_error("`ptr_src` is not a valid struct ArrowArray"); } ArrowArrayMove(obj_src, obj_dst); } else if (Rf_inherits(ptr_dst, "nanoarrow_array_stream")) { struct ArrowArrayStream* obj_dst = (struct ArrowArrayStream*)R_ExternalPtrAddr(ptr_dst); if (obj_dst == NULL) { Rf_error("`ptr_dst` is a pointer to NULL"); } if (obj_dst->release != NULL) { Rf_error("`ptr_dst` is a valid struct ArrowArrayStream"); } struct ArrowArrayStream* obj_src = (struct ArrowArrayStream*)R_ExternalPtrAddr(xptr_src); if (obj_src == NULL || obj_src->release == NULL) { Rf_error("`ptr_src` is not a valid struct ArrowArrayStream"); } ArrowArrayStreamMove(obj_src, obj_dst); } else { Rf_error( "`ptr_dst` must inherit from 'nanoarrow_schema', 'nanoarrow_array', or " "'nanoarrow_array_stream'"); } // also move SEXP dependencies 
R_SetExternalPtrProtected(ptr_dst, R_ExternalPtrProtected(xptr_src)); R_SetExternalPtrTag(ptr_dst, R_ExternalPtrTag(xptr_src)); R_SetExternalPtrProtected(xptr_src, R_NilValue); R_SetExternalPtrTag(xptr_src, R_NilValue); UNPROTECT(1); return R_NilValue; } // The rest of this package operates under the assumption that references // to a schema/array external pointer are kept by anything that needs // the underlying memory to persist. When the reference count reaches 0, // R calls the release callback (and nobody else). // When exporting to something that is expecting to call the release callback // itself (e.g., Arrow C++ via the arrow R package or pyarrow Python package), // the structure and the release callback need to keep the information. // schemas are less frequently iterated over and it's much simpler to // (recursively) copy the whole object and export it rather than try to // keep all the object dependencies alive and/or risk moving a dependency // of some other R object. SEXP nanoarrow_c_export_schema(SEXP schema_xptr, SEXP ptr_dst) { struct ArrowSchema* obj_src = nanoarrow_schema_from_xptr(schema_xptr); SEXP xptr_dst = PROTECT(nanoarrow_c_pointer(ptr_dst)); struct ArrowSchema* obj_dst = (struct ArrowSchema*)R_ExternalPtrAddr(xptr_dst); if (obj_dst == NULL) { Rf_error("`ptr_dst` is a pointer to NULL"); } if (obj_dst->release != NULL) { Rf_error("`ptr_dst` is a valid struct ArrowSchema"); } int result = ArrowSchemaDeepCopy(obj_src, obj_dst); if (result != NANOARROW_OK) { Rf_error("Failed to deep copy struct ArrowSchema"); } UNPROTECT(1); return R_NilValue; } SEXP nanoarrow_c_export_array(SEXP array_xptr, SEXP ptr_dst) { SEXP xptr_dst = PROTECT(nanoarrow_c_pointer(ptr_dst)); struct ArrowArray* obj_dst = (struct ArrowArray*)R_ExternalPtrAddr(xptr_dst); if (obj_dst == NULL) { Rf_error("`ptr_dst` is a pointer to NULL"); } if (obj_dst->release != NULL) { Rf_error("`ptr_dst` is a valid struct ArrowArray"); } array_export(array_xptr, obj_dst); UNPROTECT(1); return 
R_NilValue; } SEXP nanoarrow_c_export_array_stream(SEXP array_stream_xptr, SEXP ptr_dst) { SEXP xptr_dst = PROTECT(nanoarrow_c_pointer(ptr_dst)); struct ArrowArrayStream* obj_dst = (struct ArrowArrayStream*)R_ExternalPtrAddr(xptr_dst); if (obj_dst == NULL) { Rf_error("`ptr_dst` is a pointer to NULL"); } if (obj_dst->release != NULL) { Rf_error("`ptr_dst` is a valid struct ArrowArrayStream"); } array_stream_export(array_stream_xptr, obj_dst); // Remove SEXP dependencies (if important they are kept alive by array_stream_export) R_SetExternalPtrProtected(array_stream_xptr, R_NilValue); R_SetExternalPtrTag(array_stream_xptr, R_NilValue); UNPROTECT(1); return R_NilValue; } SEXP nanoarrow_c_pointer_set_protected(SEXP ptr_src, SEXP protected_sexp) { if (R_ExternalPtrProtected(ptr_src) != R_NilValue) { Rf_error("External pointer protected value has already been set"); } R_SetExternalPtrProtected(ptr_src, protected_sexp); return R_NilValue; } nanoarrow/src/materialize_chr.h0000644000176200001440000000550314547575511016403 0ustar liggesusers// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. 
#ifndef R_MATERIALIZE_CHR_H_INCLUDED #define R_MATERIALIZE_CHR_H_INCLUDED #include #include #include #include #include "materialize_common.h" #include "nanoarrow.h" static inline int nanoarrow_materialize_chr(struct RConverter* converter) { if (converter->src.array_view->array->dictionary != NULL) { return ENOTSUP; } struct ArrayViewSlice* src = &converter->src; struct VectorSlice* dst = &converter->dst; switch (src->array_view->storage_type) { case NANOARROW_TYPE_NA: for (R_xlen_t i = 0; i < dst->length; i++) { SET_STRING_ELT(dst->vec_sexp, dst->offset + i, NA_STRING); } return NANOARROW_OK; case NANOARROW_TYPE_INT8: case NANOARROW_TYPE_UINT8: case NANOARROW_TYPE_INT16: case NANOARROW_TYPE_UINT16: case NANOARROW_TYPE_INT32: case NANOARROW_TYPE_UINT32: case NANOARROW_TYPE_INT64: { char buf[64]; for (R_xlen_t i = 0; i < dst->length; i++) { if (ArrowArrayViewIsNull(src->array_view, src->offset + i)) { SET_STRING_ELT(dst->vec_sexp, dst->offset + i, NA_STRING); } else { int n_chars = snprintf(buf, sizeof(buf), "%" PRId64, ArrowArrayViewGetIntUnsafe(src->array_view, src->offset + i)); SET_STRING_ELT(dst->vec_sexp, dst->offset + i, Rf_mkCharLenCE(buf, n_chars, CE_UTF8)); } } return NANOARROW_OK; } case NANOARROW_TYPE_STRING: case NANOARROW_TYPE_LARGE_STRING: break; default: return ENOTSUP; } struct ArrowStringView item; for (R_xlen_t i = 0; i < dst->length; i++) { if (ArrowArrayViewIsNull(src->array_view, src->offset + i)) { SET_STRING_ELT(dst->vec_sexp, dst->offset + i, NA_STRING); } else { item = ArrowArrayViewGetStringUnsafe(src->array_view, src->offset + i); SET_STRING_ELT(dst->vec_sexp, dst->offset + i, Rf_mkCharLenCE(item.data, (int)item.size_bytes, CE_UTF8)); } } return NANOARROW_OK; } #endif nanoarrow/src/materialize.c0000644000176200001440000004402714547575511015546 0ustar liggesusers// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. 
See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #define R_NO_REMAP #include #include #include "nanoarrow.h" #include "util.h" // Needed for the list_of materializer #include "convert.h" #include "materialize.h" #include "materialize_blob.h" #include "materialize_chr.h" #include "materialize_date.h" #include "materialize_dbl.h" #include "materialize_difftime.h" #include "materialize_int.h" #include "materialize_int64.h" #include "materialize_lgl.h" #include "materialize_posixct.h" #include "materialize_unspecified.h" SEXP nanoarrow_alloc_type(enum VectorType vector_type, R_xlen_t len) { switch (vector_type) { case VECTOR_TYPE_LGL: return Rf_allocVector(LGLSXP, len); case VECTOR_TYPE_INT: return Rf_allocVector(INTSXP, len); case VECTOR_TYPE_DBL: return Rf_allocVector(REALSXP, len); case VECTOR_TYPE_CHR: return Rf_allocVector(STRSXP, len); default: return R_NilValue; } } // A version of Rf_getAttrib(x, sym) != R_NilValue that never // expands the row.names attribute static int has_attrib_safe(SEXP x, SEXP sym) { for (SEXP atts = ATTRIB(x); atts != R_NilValue; atts = CDR(atts)) { if (TAG(atts) == sym) return TRUE; } return FALSE; } R_xlen_t nanoarrow_data_frame_size(SEXP x) { if (Rf_length(x) > 0) { // This both avoids materializing the row.names attribute and // makes this work with struct-style vctrs that don't have a // row.names 
attribute but that always have one or more element return Rf_xlength(VECTOR_ELT(x, 0)); } else { // Since ALTREP was introduced, materializing the row.names attribute is // usually deferred such that values in the form c(NA, -nrow), 1:nrow, or // as.character(1:nrow) are never actually computed when the length is // taken. return Rf_xlength(Rf_getAttrib(x, R_RowNamesSymbol)); } } void nanoarrow_set_rownames(SEXP x, R_xlen_t len) { // If len fits in the integer range, we can use the c(NA, -nrow) // shortcut for the row.names attribute. R expands this when // the actual value is accessed (even from Rf_getAttrib()). // If len does not fit in the integer range, we need // as.character(seq_len(nrow)) (which returns a deferred ALTREP // string conversion of an ALTREP sequence in recent R). Manipulating // data frames with more than INT_MAX rows is not supported in most // places but column access still works. if (len <= INT_MAX) { SEXP rownames = PROTECT(Rf_allocVector(INTSXP, 2)); INTEGER(rownames)[0] = NA_INTEGER; INTEGER(rownames)[1] = (int)(-len); Rf_setAttrib(x, R_RowNamesSymbol, rownames); UNPROTECT(1); } else { SEXP length_dbl = PROTECT(Rf_ScalarReal((double)len)); SEXP seq_len_symbol = PROTECT(Rf_install("seq_len")); SEXP seq_len_call = PROTECT(Rf_lang2(seq_len_symbol, length_dbl)); SEXP rownames_call = PROTECT(Rf_lang2(R_AsCharacterSymbol, seq_len_call)); Rf_setAttrib(x, R_RowNamesSymbol, Rf_eval(rownames_call, R_BaseNamespace)); UNPROTECT(4); } } int nanoarrow_ptype_is_data_frame(SEXP ptype) { return Rf_isObject(ptype) && TYPEOF(ptype) == VECSXP && (Rf_inherits(ptype, "data.frame") || (Rf_xlength(ptype) > 0 && has_attrib_safe(ptype, R_NamesSymbol))); } SEXP nanoarrow_materialize_realloc(SEXP ptype, R_xlen_t len) { SEXP result; if (Rf_isObject(ptype)) { // There may be a more accurate test that more precisely captures the case // where a user has specified a valid ptype that doesn't work in a preallocate // + fill conversion. 
if (Rf_inherits(ptype, "factor")) { SEXP levels = Rf_getAttrib(ptype, R_LevelsSymbol); if (Rf_length(levels) == 0) { Rf_error("Can't allocate ptype of class 'factor' with empty levels"); } } if (nanoarrow_ptype_is_data_frame(ptype)) { R_xlen_t num_cols = Rf_xlength(ptype); result = PROTECT(Rf_allocVector(VECSXP, num_cols)); for (R_xlen_t i = 0; i < num_cols; i++) { SET_VECTOR_ELT(result, i, nanoarrow_materialize_realloc(VECTOR_ELT(ptype, i), len)); } // Set attributes from ptype Rf_setAttrib(result, R_NamesSymbol, Rf_getAttrib(ptype, R_NamesSymbol)); Rf_copyMostAttrib(ptype, result); // ...except rownames if (Rf_inherits(ptype, "data.frame")) { nanoarrow_set_rownames(result, len); } } else { result = PROTECT(Rf_allocVector(TYPEOF(ptype), len)); Rf_copyMostAttrib(ptype, result); } } else { result = PROTECT(Rf_allocVector(TYPEOF(ptype), len)); } UNPROTECT(1); return result; } // Used in union building to pre-set all values to null static void fill_vec_with_nulls(SEXP x, R_xlen_t offset, R_xlen_t len) { if (nanoarrow_ptype_is_data_frame(x)) { for (R_xlen_t i = 0; i < Rf_xlength(x); i++) { fill_vec_with_nulls(VECTOR_ELT(x, i), offset, len); } return; } switch (TYPEOF(x)) { case RAWSXP: // Not perfect: raw() doesn't really support NA in R memset(RAW(x), 0, len * sizeof(char)); break; case LGLSXP: case INTSXP: { int* values = INTEGER(x); for (R_xlen_t i = 0; i < len; i++) { values[offset + i] = NA_INTEGER; } return; } case REALSXP: { double* values = REAL(x); for (R_xlen_t i = 0; i < len; i++) { values[offset + i] = NA_REAL; } return; } case CPLXSXP: { Rcomplex* values = COMPLEX(x); Rcomplex na_value; na_value.r = NA_REAL; na_value.i = NA_REAL; for (R_xlen_t i = 0; i < len; i++) { values[offset + i] = na_value; } return; } case STRSXP: for (R_xlen_t i = 0; i < len; i++) { SET_STRING_ELT(x, offset + i, NA_STRING); } return; case VECSXP: for (R_xlen_t i = 0; i < len; i++) { SET_VECTOR_ELT(x, offset + i, R_NilValue); } return; default: Rf_error("Attempt to fill vector with 
nulls with unsupported type"); } } static void copy_vec_into(SEXP x, SEXP dst, R_xlen_t offset, R_xlen_t len) { if (nanoarrow_ptype_is_data_frame(dst)) { if (!nanoarrow_ptype_is_data_frame(x)) { Rf_error("Expected record-style vctr result but got non-record-style result"); } R_xlen_t x_len = nanoarrow_data_frame_size(x); if (len != x_len) { Rf_error("Unexpected data.frame row count in copy_vec_into()"); } // This does not currently consider column names (i.e., it blindly copies // by index). if (Rf_xlength(x) != Rf_xlength(dst)) { Rf_error("Unexpected data.frame column count in copy_vec_into()"); } for (R_xlen_t i = 0; i < Rf_xlength(x); i++) { copy_vec_into(VECTOR_ELT(x, i), VECTOR_ELT(dst, i), offset, len); } return; } else if (nanoarrow_ptype_is_data_frame(x)) { Rf_error("Expected non-record-style vctr result but got record-style result"); } if (TYPEOF(dst) != TYPEOF(x)) { Rf_error("Unexpected SEXP type in result copy_vec_into()"); } if (Rf_length(x) != len) { Rf_error("Unexpected length of result in copy_vec_into()"); } switch (TYPEOF(dst)) { case RAWSXP: memcpy(RAW(dst) + offset, RAW(x), len * sizeof(uint8_t)); break; case REALSXP: memcpy(REAL(dst) + offset, REAL(x), len * sizeof(double)); break; case INTSXP: case LGLSXP: memcpy(INTEGER(dst) + offset, INTEGER(x), len * sizeof(int)); break; case CPLXSXP: memcpy(COMPLEX(dst) + offset, COMPLEX(x), len * sizeof(Rcomplex)); break; case STRSXP: for (R_xlen_t i = 0; i < len; i++) { SET_STRING_ELT(dst, offset + i, STRING_ELT(x, i)); } break; case VECSXP: for (R_xlen_t i = 0; i < len; i++) { SET_VECTOR_ELT(dst, offset + i, VECTOR_ELT(x, i)); } break; default: Rf_error("Unhandled SEXP type in copy_vec_into()"); break; } } static int nanoarrow_materialize_other(struct RConverter* converter, SEXP converter_xptr) { // Ensure that we have a ptype SEXP to send in the call back to R if (converter->ptype_view.ptype == R_NilValue) { SEXP ptype = PROTECT(nanoarrow_alloc_type(converter->ptype_view.vector_type, 0)); 
converter->ptype_view.ptype = ptype; SET_VECTOR_ELT(R_ExternalPtrProtected(converter_xptr), 0, ptype); UNPROTECT(1); } // A unique situation where we don't want owning external pointers because we know // these are protected for the duration of our call into R and because we don't want // the underlying array to be released and invalidate the converter. The R code in // convert_fallback_other() takes care of ensuring an independent copy with the correct // offset/length. SEXP schema_xptr = PROTECT(R_MakeExternalPtr( (struct ArrowSchema*)converter->schema_view.schema, R_NilValue, R_NilValue)); Rf_setAttrib(schema_xptr, R_ClassSymbol, nanoarrow_cls_schema); // We do need to set the protected member of the array external pointer to signal that // it is not an independent array (i.e., force a shallow copy). SEXP array_xptr = PROTECT(R_MakeExternalPtr( (struct ArrowArray*)converter->array_view.array, schema_xptr, converter_xptr)); Rf_setAttrib(array_xptr, R_ClassSymbol, nanoarrow_cls_array); SEXP offset_sexp = PROTECT( Rf_ScalarReal((double)(converter->src.array_view->offset + converter->src.offset))); SEXP length_sexp = PROTECT(Rf_ScalarReal((double)converter->src.length)); SEXP fun = PROTECT(Rf_install("convert_fallback_other")); SEXP call = PROTECT( Rf_lang5(fun, array_xptr, offset_sexp, length_sexp, converter->ptype_view.ptype)); SEXP result_src = PROTECT(Rf_eval(call, nanoarrow_ns_pkg)); // Copy the result into a slice of dst copy_vec_into(result_src, converter->dst.vec_sexp, converter->dst.offset, converter->dst.length); UNPROTECT(7); return NANOARROW_OK; } static int nanoarrow_materialize_data_frame(struct RConverter* converter, SEXP converter_xptr) { if (converter->ptype_view.vector_type != VECTOR_TYPE_DATA_FRAME) { return EINVAL; } // Make sure we error for dictionary types if (converter->src.array_view->array->dictionary != NULL) { return EINVAL; } SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr); SEXP child_converter_xptrs = 
VECTOR_ELT(converter_shelter, 3); switch (converter->array_view.storage_type) { case NANOARROW_TYPE_STRUCT: for (R_xlen_t i = 0; i < converter->n_children; i++) { converter->children[i]->src.offset = converter->src.offset; converter->children[i]->src.length = converter->src.length; converter->children[i]->dst.offset = converter->dst.offset; converter->children[i]->dst.length = converter->dst.length; SEXP child_converter_xptr = VECTOR_ELT(child_converter_xptrs, i); NANOARROW_RETURN_NOT_OK( nanoarrow_materialize(converter->children[i], child_converter_xptr)); } return NANOARROW_OK; case NANOARROW_TYPE_DENSE_UNION: case NANOARROW_TYPE_SPARSE_UNION: // Pre-fill everything with nulls fill_vec_with_nulls(converter->dst.vec_sexp, converter->dst.offset, converter->dst.length); // Fill in the possibly non-null values one at a time for (R_xlen_t i = 0; i < converter->dst.length; i++) { int64_t child_index = ArrowArrayViewUnionChildIndex(&converter->array_view, converter->src.offset + i); int64_t child_offset = ArrowArrayViewUnionChildOffset(&converter->array_view, converter->src.offset + i); converter->children[child_index]->src.offset = child_offset; converter->children[child_index]->src.length = 1; converter->children[child_index]->dst.offset = converter->dst.offset + i; converter->children[child_index]->dst.length = 1; SEXP child_converter_xptr = VECTOR_ELT(child_converter_xptrs, child_index); NANOARROW_RETURN_NOT_OK(nanoarrow_materialize(converter->children[child_index], child_converter_xptr)); } return NANOARROW_OK; default: return ENOTSUP; } } static int materialize_list_element(struct RConverter* converter, SEXP converter_xptr, int64_t offset, int64_t length) { if (nanoarrow_converter_reserve(converter_xptr, length) != NANOARROW_OK) { nanoarrow_converter_stop(converter_xptr); } converter->src.offset = offset; converter->src.length = length; converter->dst.offset = 0; converter->dst.length = length; if (nanoarrow_converter_materialize_n(converter_xptr, length) != 
length) { return EINVAL; } NANOARROW_RETURN_NOT_OK(nanoarrow_converter_finalize(converter_xptr)); return NANOARROW_OK; } static int nanoarrow_materialize_list_of(struct RConverter* converter, SEXP converter_xptr) { SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr); SEXP child_converter_xptrs = VECTOR_ELT(converter_shelter, 3); struct RConverter* child_converter = converter->children[0]; SEXP child_converter_xptr = VECTOR_ELT(child_converter_xptrs, 0); struct ArrayViewSlice* src = &converter->src; struct VectorSlice* dst = &converter->dst; // Make sure we error for dictionary types if (src->array_view->array->dictionary != NULL) { return EINVAL; } const int32_t* offsets = src->array_view->buffer_views[1].data.as_int32; const int64_t* large_offsets = src->array_view->buffer_views[1].data.as_int64; int64_t raw_src_offset = src->array_view->array->offset + src->offset; int64_t offset; int64_t length; switch (src->array_view->storage_type) { case NANOARROW_TYPE_NA: return NANOARROW_OK; case NANOARROW_TYPE_LIST: for (int64_t i = 0; i < dst->length; i++) { if (!ArrowArrayViewIsNull(src->array_view, src->offset + i)) { offset = offsets[raw_src_offset + i]; length = offsets[raw_src_offset + i + 1] - offset; NANOARROW_RETURN_NOT_OK(materialize_list_element( child_converter, child_converter_xptr, offset, length)); SET_VECTOR_ELT(dst->vec_sexp, dst->offset + i, nanoarrow_converter_release_result(child_converter_xptr)); } } break; case NANOARROW_TYPE_LARGE_LIST: for (int64_t i = 0; i < dst->length; i++) { if (!ArrowArrayViewIsNull(src->array_view, src->offset + i)) { offset = large_offsets[raw_src_offset + i]; length = large_offsets[raw_src_offset + i + 1] - offset; NANOARROW_RETURN_NOT_OK(materialize_list_element( child_converter, child_converter_xptr, offset, length)); SET_VECTOR_ELT(dst->vec_sexp, dst->offset + i, nanoarrow_converter_release_result(child_converter_xptr)); } } break; case NANOARROW_TYPE_FIXED_SIZE_LIST: length = 
src->array_view->layout.child_size_elements; for (int64_t i = 0; i < dst->length; i++) { if (!ArrowArrayViewIsNull(src->array_view, src->offset + i)) { offset = (raw_src_offset + i) * length; NANOARROW_RETURN_NOT_OK(materialize_list_element( child_converter, child_converter_xptr, offset, length)); SET_VECTOR_ELT(dst->vec_sexp, dst->offset + i, nanoarrow_converter_release_result(child_converter_xptr)); } } break; default: return EINVAL; } return NANOARROW_OK; } static int nanoarrow_materialize_base(struct RConverter* converter, SEXP converter_xptr) { struct ArrayViewSlice* src = &converter->src; struct VectorSlice* dst = &converter->dst; struct MaterializeOptions* options = converter->options; // Make sure extension conversion calls into R if (converter->schema_view.extension_name.size_bytes > 0) { return nanoarrow_materialize_other(converter, converter_xptr); } switch (converter->ptype_view.vector_type) { case VECTOR_TYPE_UNSPECIFIED: return nanoarrow_materialize_unspecified(src, dst, options); case VECTOR_TYPE_LGL: return nanoarrow_materialize_lgl(src, dst, options); case VECTOR_TYPE_INT: return nanoarrow_materialize_int(src, dst, options); case VECTOR_TYPE_DBL: return nanoarrow_materialize_dbl(converter); case VECTOR_TYPE_CHR: return nanoarrow_materialize_chr(converter); case VECTOR_TYPE_POSIXCT: return nanoarrow_materialize_posixct(converter); case VECTOR_TYPE_DATE: return nanoarrow_materialize_date(converter); case VECTOR_TYPE_DIFFTIME: return nanoarrow_materialize_difftime(converter); case VECTOR_TYPE_INTEGER64: return nanoarrow_materialize_int64(src, dst, options); case VECTOR_TYPE_BLOB: return nanoarrow_materialize_blob(src, dst, options); case VECTOR_TYPE_LIST_OF: return nanoarrow_materialize_list_of(converter, converter_xptr); case VECTOR_TYPE_DATA_FRAME: return nanoarrow_materialize_data_frame(converter, converter_xptr); default: return nanoarrow_materialize_other(converter, converter_xptr); } } int nanoarrow_materialize(struct RConverter* converter, SEXP 
converter_xptr) { int result = nanoarrow_materialize_base(converter, converter_xptr); if (result != NANOARROW_OK) { return nanoarrow_materialize_other(converter, converter_xptr); } else { return NANOARROW_OK; } } nanoarrow/src/version.c0000644000176200001440000000203014405371036014676 0ustar liggesusers// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #define R_NO_REMAP #include #include #include "nanoarrow.h" SEXP nanoarrow_c_version(void) { return Rf_mkString(NANOARROW_VERSION); } SEXP nanoarrow_c_version_runtime(void) { return Rf_mkString(ArrowNanoarrowVersion()); } nanoarrow/src/convert_array.c0000644000176200001440000002426114547575511016114 0ustar liggesusers// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. 
You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #define R_NO_REMAP #include #include #include "nanoarrow.h" #include "altrep.h" #include "array.h" #include "array_view.h" #include "convert.h" #include "util.h" // The common case of converting a single array into a single vector is // defined here, powered by the generic conversion available via // convert.h but special-casing the common case of "just use the defaults" // (i.e., no need to allocate a zero-size ptype) and returning ALTREP // where possible. // borrow nanoarrow_c_infer_ptype() from infer_ptype.c SEXP nanoarrow_c_infer_ptype(SEXP schema_xptr); enum VectorType nanoarrow_infer_vector_type_array(SEXP array_xptr); // This calls nanoarrow::convert_array() (via a package helper) to try S3 // dispatch to find a convert_array() method (or error if there // isn't one) static SEXP call_convert_array(SEXP array_xptr, SEXP ptype_sexp) { SEXP fun = PROTECT(Rf_install("convert_fallback_other")); // offset/length don't need to be modified in this case SEXP call = PROTECT(Rf_lang5(fun, array_xptr, R_NilValue, R_NilValue, ptype_sexp)); SEXP result = PROTECT(Rf_eval(call, nanoarrow_ns_pkg)); UNPROTECT(3); return result; } // Call stop_cant_convert_array(), which gives a more informative error // message than we can provide in a reasonable amount of C code here. // Because we opportunistically avoid allocating a ptype object, we might // have to allocate one here. 
static void call_stop_cant_convert_array(SEXP array_xptr, enum VectorType type, SEXP ptype_sexp) { SEXP fun = PROTECT(Rf_install("stop_cant_convert_array")); if (ptype_sexp == R_NilValue) { ptype_sexp = PROTECT(nanoarrow_alloc_type(type, 0)); SEXP call = PROTECT(Rf_lang3(fun, array_xptr, ptype_sexp)); Rf_eval(call, nanoarrow_ns_pkg); UNPROTECT(3); } else { SEXP call = PROTECT(Rf_lang3(fun, array_xptr, ptype_sexp)); Rf_eval(call, nanoarrow_ns_pkg); UNPROTECT(2); } } static SEXP convert_array_default(SEXP array_xptr, enum VectorType vector_type, SEXP ptype) { SEXP converter_xptr; if (ptype == R_NilValue) { converter_xptr = PROTECT(nanoarrow_converter_from_type(vector_type)); } else { converter_xptr = PROTECT(nanoarrow_converter_from_ptype(ptype)); } if (nanoarrow_converter_set_schema(converter_xptr, array_xptr_get_schema(array_xptr)) != NANOARROW_OK) { nanoarrow_converter_stop(converter_xptr); } if (nanoarrow_converter_set_array(converter_xptr, array_xptr) != NANOARROW_OK) { nanoarrow_converter_stop(converter_xptr); } if (nanoarrow_converter_materialize_all(converter_xptr) != NANOARROW_OK) { call_stop_cant_convert_array(array_xptr, vector_type, ptype); } if (nanoarrow_converter_finalize(converter_xptr) != NANOARROW_OK) { nanoarrow_converter_stop(converter_xptr); } SEXP result = PROTECT(nanoarrow_converter_release_result(converter_xptr)); UNPROTECT(2); return result; } static SEXP convert_array_chr(SEXP array_xptr, SEXP ptype_sexp) { struct ArrowSchema* schema = schema_from_array_xptr(array_xptr); struct ArrowSchemaView schema_view; if (ArrowSchemaViewInit(&schema_view, schema, NULL) != NANOARROW_OK) { Rf_error("Invalid schema"); } // If array_xptr is an extension, use default conversion int source_can_altrep; switch (schema_view.type) { case NANOARROW_TYPE_STRING: case NANOARROW_TYPE_LARGE_STRING: source_can_altrep = 1; break; default: source_can_altrep = 0; } if (!source_can_altrep || schema_view.extension_name.size_bytes > 0) { // Default conversion requires a 
ptype: resolve it if not already specified if (ptype_sexp == R_NilValue) { ptype_sexp = PROTECT(nanoarrow_c_infer_ptype(array_xptr_get_schema(array_xptr))); SEXP default_result = PROTECT(convert_array_default(array_xptr, VECTOR_TYPE_CHR, ptype_sexp)); UNPROTECT(2); return default_result; } else { return convert_array_default(array_xptr, VECTOR_TYPE_CHR, ptype_sexp); } } struct ArrowArray* array = (struct ArrowArray*)R_ExternalPtrAddr(array_xptr); if (array->dictionary == NULL) { SEXP result = PROTECT(nanoarrow_c_make_altrep_chr(array_xptr)); if (result == R_NilValue) { call_stop_cant_convert_array(array_xptr, VECTOR_TYPE_CHR, R_NilValue); } UNPROTECT(1); return result; } else { return convert_array_default(array_xptr, VECTOR_TYPE_CHR, R_NilValue); } } SEXP nanoarrow_c_convert_array(SEXP array_xptr, SEXP ptype_sexp); static SEXP convert_array_data_frame(SEXP array_xptr, SEXP ptype_sexp) { struct ArrowSchema* schema = schema_from_array_xptr(array_xptr); struct ArrowSchemaView schema_view; if (ArrowSchemaViewInit(&schema_view, schema, NULL) != NANOARROW_OK) { Rf_error("Invalid schema"); } // If array_xptr is an extension, union, or the ptype isn't a data.frame // use convert/materialize convert behaviour. 
// Default conversion requires a ptype: resolve it if not already specified if (schema_view.storage_type != NANOARROW_TYPE_STRUCT || schema_view.extension_name.size_bytes > 0 || (ptype_sexp != R_NilValue && !Rf_inherits(ptype_sexp, "data.frame"))) { if (ptype_sexp == R_NilValue) { ptype_sexp = PROTECT(nanoarrow_c_infer_ptype(array_xptr_get_schema(array_xptr))); SEXP default_result = PROTECT(convert_array_default(array_xptr, VECTOR_TYPE_OTHER, ptype_sexp)); UNPROTECT(2); return default_result; } else { return convert_array_default(array_xptr, VECTOR_TYPE_DATA_FRAME, ptype_sexp); } } struct ArrowArray* array = nanoarrow_array_from_xptr(array_xptr); R_xlen_t n_col = array->n_children; SEXP result = PROTECT(Rf_allocVector(VECSXP, n_col)); if (ptype_sexp == R_NilValue) { SEXP result_names = PROTECT(Rf_allocVector(STRSXP, n_col)); for (R_xlen_t i = 0; i < n_col; i++) { SEXP child_xptr = PROTECT(borrow_array_child_xptr(array_xptr, i)); SET_VECTOR_ELT(result, i, nanoarrow_c_convert_array(child_xptr, R_NilValue)); UNPROTECT(1); struct ArrowSchema* schema = schema_from_array_xptr(child_xptr); if (schema->name != NULL) { SET_STRING_ELT(result_names, i, Rf_mkCharCE(schema->name, CE_UTF8)); } else { SET_STRING_ELT(result_names, i, Rf_mkChar("")); } } Rf_setAttrib(result, R_NamesSymbol, result_names); Rf_setAttrib(result, R_ClassSymbol, nanoarrow_cls_data_frame); UNPROTECT(1); } else { if (n_col != Rf_xlength(ptype_sexp)) { Rf_error("Expected data.frame() ptype with %ld column(s) but found %ld column(s)", (long)n_col, (long)Rf_xlength(ptype_sexp)); } for (R_xlen_t i = 0; i < n_col; i++) { SEXP child_xptr = PROTECT(borrow_array_child_xptr(array_xptr, i)); SEXP child_ptype = VECTOR_ELT(ptype_sexp, i); SET_VECTOR_ELT(result, i, nanoarrow_c_convert_array(child_xptr, child_ptype)); UNPROTECT(1); } Rf_setAttrib(result, R_NamesSymbol, Rf_getAttrib(ptype_sexp, R_NamesSymbol)); Rf_copyMostAttrib(ptype_sexp, result); } if (Rf_inherits(result, "data.frame")) { 
nanoarrow_set_rownames(result, array->length); } UNPROTECT(1); return result; } SEXP nanoarrow_c_convert_array(SEXP array_xptr, SEXP ptype_sexp) { // See if we can skip any ptype resolution at all if (ptype_sexp == R_NilValue) { enum VectorType vector_type = nanoarrow_infer_vector_type_array(array_xptr); switch (vector_type) { case VECTOR_TYPE_LGL: case VECTOR_TYPE_INT: case VECTOR_TYPE_DBL: return convert_array_default(array_xptr, vector_type, R_NilValue); case VECTOR_TYPE_CHR: return convert_array_chr(array_xptr, ptype_sexp); case VECTOR_TYPE_DATA_FRAME: return convert_array_data_frame(array_xptr, R_NilValue); default: break; } // Otherwise, resolve the ptype and use it (this will also error // for ptypes that can't be resolved) ptype_sexp = PROTECT(nanoarrow_c_infer_ptype(array_xptr_get_schema(array_xptr))); SEXP result = nanoarrow_c_convert_array(array_xptr, ptype_sexp); UNPROTECT(1); return result; } // Handle some S3 objects internally to avoid S3 dispatch // (e.g., when looping over a data frame with a lot of columns) if (Rf_isObject(ptype_sexp)) { if (nanoarrow_ptype_is_data_frame(ptype_sexp)) { return convert_array_data_frame(array_xptr, ptype_sexp); } else if (Rf_inherits(ptype_sexp, "vctrs_unspecified") || Rf_inherits(ptype_sexp, "blob") || Rf_inherits(ptype_sexp, "vctrs_list_of") || Rf_inherits(ptype_sexp, "Date") || Rf_inherits(ptype_sexp, "hms") || Rf_inherits(ptype_sexp, "POSIXct") || Rf_inherits(ptype_sexp, "difftime") || Rf_inherits(ptype_sexp, "integer64")) { return convert_array_default(array_xptr, VECTOR_TYPE_UNINITIALIZED, ptype_sexp); } else { return call_convert_array(array_xptr, ptype_sexp); } } // If we're here, these are non-S3 objects switch (TYPEOF(ptype_sexp)) { case LGLSXP: return convert_array_default(array_xptr, VECTOR_TYPE_LGL, ptype_sexp); case INTSXP: return convert_array_default(array_xptr, VECTOR_TYPE_INT, ptype_sexp); case REALSXP: return convert_array_default(array_xptr, VECTOR_TYPE_DBL, ptype_sexp); case STRSXP: return 
convert_array_chr(array_xptr, ptype_sexp); default: return call_convert_array(array_xptr, ptype_sexp); } } nanoarrow/src/util.c0000644000176200001440000000476114547575511014216 0ustar liggesusers// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #define R_NO_REMAP #include #include #include "util.h" SEXP nanoarrow_ns_pkg = NULL; SEXP nanoarrow_cls_array = NULL; SEXP nanoarrow_cls_altrep_chr = NULL; SEXP nanoarrow_cls_array_view = NULL; SEXP nanoarrow_cls_data_frame = NULL; SEXP nanoarrow_cls_schema = NULL; SEXP nanoarrow_cls_array_stream = NULL; SEXP nanoarrow_cls_buffer = NULL; void nanoarrow_init_cached_sexps(void) { SEXP nanoarrow_str = PROTECT(Rf_mkString("nanoarrow")); nanoarrow_ns_pkg = PROTECT(R_FindNamespace(nanoarrow_str)); nanoarrow_cls_array = PROTECT(Rf_mkString("nanoarrow_array")); nanoarrow_cls_altrep_chr = PROTECT(Rf_mkString("nanoarrow::altrep_chr")); nanoarrow_cls_array_view = PROTECT(Rf_mkString("nanoarrow_array_view")); nanoarrow_cls_data_frame = PROTECT(Rf_mkString("data.frame")); nanoarrow_cls_schema = PROTECT(Rf_mkString("nanoarrow_schema")); nanoarrow_cls_array_stream = PROTECT(Rf_mkString("nanoarrow_array_stream")); nanoarrow_cls_buffer = PROTECT(Rf_mkString("nanoarrow_buffer")); 
R_PreserveObject(nanoarrow_ns_pkg); R_PreserveObject(nanoarrow_cls_array); R_PreserveObject(nanoarrow_cls_altrep_chr); R_PreserveObject(nanoarrow_cls_array_view); R_PreserveObject(nanoarrow_cls_data_frame); R_PreserveObject(nanoarrow_cls_schema); R_PreserveObject(nanoarrow_cls_array_stream); R_PreserveObject(nanoarrow_cls_buffer); UNPROTECT(9); } SEXP nanoarrow_c_preserved_count(void) { return Rf_ScalarReal((double)nanoarrow_preserved_count()); } SEXP nanoarrow_c_preserved_empty(void) { return Rf_ScalarReal((double)nanoarrow_preserved_empty()); } SEXP nanoarrow_c_preserve_and_release_on_other_thread(SEXP obj) { nanoarrow_preserve_and_release_on_other_thread(obj); return R_NilValue; } nanoarrow/src/materialize_lgl.h0000644000176200001440000000603114502402562016364 0ustar liggesusers// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. 
#ifndef R_MATERIALIZE_LGL_H_INCLUDED #define R_MATERIALIZE_LGL_H_INCLUDED #include #include #include "materialize_common.h" #include "nanoarrow.h" static int nanoarrow_materialize_lgl(struct ArrayViewSlice* src, struct VectorSlice* dst, struct MaterializeOptions* options) { if (src->array_view->array->dictionary != NULL) { return ENOTSUP; } // True for all the types supported here const uint8_t* is_valid = src->array_view->buffer_views[0].data.as_uint8; const uint8_t* data_buffer = src->array_view->buffer_views[1].data.as_uint8; int64_t raw_src_offset = src->array_view->array->offset + src->offset; int* result = LOGICAL(dst->vec_sexp); // Fill the buffer switch (src->array_view->storage_type) { case NANOARROW_TYPE_NA: for (R_xlen_t i = 0; i < dst->length; i++) { result[dst->offset + i] = NA_LOGICAL; } break; case NANOARROW_TYPE_BOOL: ArrowBitsUnpackInt32(data_buffer, raw_src_offset, dst->length, result + dst->offset); // Set any nulls to NA_LOGICAL if (is_valid != NULL && src->array_view->array->null_count != 0) { for (R_xlen_t i = 0; i < dst->length; i++) { if (!ArrowBitGet(is_valid, raw_src_offset + i)) { result[dst->offset + i] = NA_LOGICAL; } } } break; case NANOARROW_TYPE_INT8: case NANOARROW_TYPE_UINT8: case NANOARROW_TYPE_INT16: case NANOARROW_TYPE_UINT16: case NANOARROW_TYPE_INT32: case NANOARROW_TYPE_UINT32: case NANOARROW_TYPE_INT64: case NANOARROW_TYPE_UINT64: case NANOARROW_TYPE_FLOAT: case NANOARROW_TYPE_DOUBLE: for (R_xlen_t i = 0; i < src->array_view->array->length; i++) { result[dst->offset + i] = ArrowArrayViewGetIntUnsafe(src->array_view, src->offset + i) != 0; } // Set any nulls to NA_LOGICAL if (is_valid != NULL && src->array_view->array->null_count != 0) { for (R_xlen_t i = 0; i < dst->length; i++) { if (!ArrowBitGet(is_valid, raw_src_offset + i)) { result[dst->offset + i] = NA_LOGICAL; } } } break; default: return EINVAL; } return NANOARROW_OK; } #endif nanoarrow/src/schema.h0000644000176200001440000000362714547575511014506 0ustar 
liggesusers// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #ifndef R_NANOARROW_SCHEMA_H_INCLUDED #define R_NANOARROW_SCHEMA_H_INCLUDED #include #include #include #include "nanoarrow.h" #include "util.h" // Returns an external pointer to a schema child. The returned pointer will keep its // parent alive: this is typically what you want when printing or performing a conversion, // where the borrowed external pointer is ephemeral. SEXP borrow_schema_child_xptr(SEXP schema_xptr, int64_t i); // Returns the underlying struct ArrowSchema* from an external pointer, // checking and erroring for invalid objects, pointers, and arrays, but // allowing for R_NilValue to signify a NULL return. 
static inline struct ArrowSchema* nullable_schema_from_xptr(SEXP schema_xptr) { if (schema_xptr == R_NilValue) { return NULL; } else { return nanoarrow_schema_from_xptr(schema_xptr); } } static inline void schema_export(SEXP schema_xptr, struct ArrowSchema* schema_copy) { int result = ArrowSchemaDeepCopy(nanoarrow_schema_from_xptr(schema_xptr), schema_copy); if (result != NANOARROW_OK) { Rf_error("ArrowSchemaDeepCopy() failed"); } } #endif nanoarrow/src/schema.c0000644000176200001440000004766114547575511014507 0ustar liggesusers// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. 
#define R_NO_REMAP #include #include #include "nanoarrow.h" #include "schema.h" #include "util.h" SEXP nanoarrow_c_schema_init(SEXP type_id_sexp, SEXP nullable_sexp) { int type_id = INTEGER(type_id_sexp)[0]; SEXP schema_xptr = PROTECT(nanoarrow_schema_owning_xptr()); struct ArrowSchema* schema = nanoarrow_output_schema_from_xptr(schema_xptr); int result = ArrowSchemaInitFromType(schema, type_id); if (result != NANOARROW_OK) { Rf_error("ArrowSchemaInitFromType() failed"); } result = ArrowSchemaSetName(schema, ""); if (result != NANOARROW_OK) { Rf_error("ArrowSchemaSetName() failed"); } if (!LOGICAL(nullable_sexp)[0]) { schema->flags &= ~ARROW_FLAG_NULLABLE; } UNPROTECT(1); return schema_xptr; } SEXP nanoarrow_c_schema_init_date_time(SEXP type_id_sexp, SEXP time_unit_sexp, SEXP timezone_sexp, SEXP nullable_sexp) { int type_id = INTEGER(type_id_sexp)[0]; int time_unit = INTEGER(time_unit_sexp)[0]; const char* timezone = NULL; if (timezone_sexp != R_NilValue) { timezone = Rf_translateCharUTF8(STRING_ELT(timezone_sexp, 0)); } else { timezone = NULL; } SEXP schema_xptr = PROTECT(nanoarrow_schema_owning_xptr()); struct ArrowSchema* schema = nanoarrow_output_schema_from_xptr(schema_xptr); ArrowSchemaInit(schema); int result = ArrowSchemaSetTypeDateTime(schema, type_id, time_unit, timezone); if (result != NANOARROW_OK) { Rf_error("ArrowSchemaSetTypeDateTime() failed"); } result = ArrowSchemaSetName(schema, ""); if (result != NANOARROW_OK) { Rf_error("ArrowSchemaSetName() failed"); } if (!LOGICAL(nullable_sexp)[0]) { schema->flags &= ~ARROW_FLAG_NULLABLE; } UNPROTECT(1); return schema_xptr; } SEXP nanoarrow_c_schema_init_decimal(SEXP type_id_sexp, SEXP precision_sexp, SEXP scale_sexp, SEXP nullable_sexp) { int type_id = INTEGER(type_id_sexp)[0]; int precision = INTEGER(precision_sexp)[0]; int scale = INTEGER(scale_sexp)[0]; SEXP schema_xptr = PROTECT(nanoarrow_schema_owning_xptr()); struct ArrowSchema* schema = nanoarrow_output_schema_from_xptr(schema_xptr); 
ArrowSchemaInit(schema); int result = ArrowSchemaSetTypeDecimal(schema, type_id, precision, scale); if (result != NANOARROW_OK) { Rf_error("ArrowSchemaSetTypeDecimal() failed"); } result = ArrowSchemaSetName(schema, ""); if (result != NANOARROW_OK) { Rf_error("ArrowSchemaSetName() failed"); } if (!LOGICAL(nullable_sexp)[0]) { schema->flags &= ~ARROW_FLAG_NULLABLE; } UNPROTECT(1); return schema_xptr; } SEXP nanoarrow_c_schema_init_fixed_size(SEXP type_id_sexp, SEXP fixed_size_sexp, SEXP nullable_sexp) { int type_id = INTEGER(type_id_sexp)[0]; int fixed_size = INTEGER(fixed_size_sexp)[0]; SEXP schema_xptr = PROTECT(nanoarrow_schema_owning_xptr()); struct ArrowSchema* schema = nanoarrow_output_schema_from_xptr(schema_xptr); ArrowSchemaInit(schema); int result = ArrowSchemaSetTypeFixedSize(schema, type_id, fixed_size); if (result != NANOARROW_OK) { Rf_error("ArrowSchemaSetTypeFixedSize() failed"); } result = ArrowSchemaSetName(schema, ""); if (result != NANOARROW_OK) { Rf_error("ArrowSchemaSetName() failed"); } if (!LOGICAL(nullable_sexp)[0]) { schema->flags &= ~ARROW_FLAG_NULLABLE; } UNPROTECT(1); return schema_xptr; } static SEXP schema_metadata_to_list(const char* metadata) { if (metadata == NULL) { return R_NilValue; } struct ArrowMetadataReader reader; int result = ArrowMetadataReaderInit(&reader, metadata); if (result != NANOARROW_OK) { Rf_error("ArrowMetadataReaderInit() failed"); } SEXP names = PROTECT(Rf_allocVector(STRSXP, reader.remaining_keys)); SEXP values = PROTECT(Rf_allocVector(VECSXP, reader.remaining_keys)); struct ArrowStringView key; struct ArrowStringView value; R_xlen_t i = 0; while (reader.remaining_keys > 0) { result = ArrowMetadataReaderRead(&reader, &key, &value); if (result != NANOARROW_OK) { Rf_error("ArrowMetadataReaderRead() failed"); } SET_STRING_ELT(names, i, Rf_mkCharLenCE(key.data, (int)key.size_bytes, CE_UTF8)); SEXP value_raw = PROTECT(Rf_allocVector(RAWSXP, value.size_bytes)); memcpy(RAW(value_raw), value.data, value.size_bytes); 
SET_VECTOR_ELT(values, i, value_raw); UNPROTECT(1); i++; } Rf_setAttrib(values, R_NamesSymbol, names); UNPROTECT(2); return values; } static SEXP borrow_schema_xptr(struct ArrowSchema* schema, SEXP shelter) { SEXP schema_xptr = PROTECT(R_MakeExternalPtr(schema, R_NilValue, shelter)); Rf_setAttrib(schema_xptr, R_ClassSymbol, nanoarrow_cls_schema); UNPROTECT(1); return schema_xptr; } SEXP borrow_schema_child_xptr(SEXP schema_xptr, int64_t i) { struct ArrowSchema* schema = nanoarrow_schema_from_xptr(schema_xptr); return borrow_schema_xptr(schema->children[i], schema_xptr); } SEXP nanoarrow_c_schema_to_list(SEXP schema_xptr) { struct ArrowSchema* schema = nanoarrow_schema_from_xptr(schema_xptr); const char* names[] = {"format", "name", "metadata", "flags", "children", "dictionary", ""}; SEXP result = PROTECT(Rf_mkNamed(VECSXP, names)); SEXP format_sexp = PROTECT(Rf_allocVector(STRSXP, 1)); SET_STRING_ELT(format_sexp, 0, Rf_mkCharCE(schema->format, CE_UTF8)); SET_VECTOR_ELT(result, 0, format_sexp); UNPROTECT(1); if (schema->name != NULL) { SEXP name_sexp = PROTECT(Rf_allocVector(STRSXP, 1)); SET_STRING_ELT(name_sexp, 0, Rf_mkCharCE(schema->name, CE_UTF8)); SET_VECTOR_ELT(result, 1, name_sexp); UNPROTECT(1); } else { SET_VECTOR_ELT(result, 1, R_NilValue); } SET_VECTOR_ELT(result, 2, schema_metadata_to_list(schema->metadata)); SET_VECTOR_ELT(result, 3, Rf_ScalarInteger((int)schema->flags)); if (schema->n_children > 0) { SEXP children_sexp = PROTECT(Rf_allocVector(VECSXP, schema->n_children)); SEXP children_names_sexp = PROTECT(Rf_allocVector(STRSXP, schema->n_children)); for (R_xlen_t i = 0; i < schema->n_children; i++) { SEXP child_xptr = PROTECT(borrow_schema_xptr(schema->children[i], schema_xptr)); SET_VECTOR_ELT(children_sexp, i, child_xptr); if (schema->children[i]->name != NULL) { SET_STRING_ELT(children_names_sexp, i, Rf_mkCharCE(schema->children[i]->name, CE_UTF8)); } else { SET_STRING_ELT(children_names_sexp, i, Rf_mkCharCE("", CE_UTF8)); } UNPROTECT(1); } 
Rf_setAttrib(children_sexp, R_NamesSymbol, children_names_sexp); SET_VECTOR_ELT(result, 4, children_sexp); UNPROTECT(2); } else { SET_VECTOR_ELT(result, 4, Rf_allocVector(VECSXP, schema->n_children)); } if (schema->dictionary != NULL) { SEXP dictionary_xptr = PROTECT(borrow_schema_xptr(schema->dictionary, schema_xptr)); SET_VECTOR_ELT(result, 5, dictionary_xptr); UNPROTECT(1); } else { SET_VECTOR_ELT(result, 5, R_NilValue); } UNPROTECT(1); return result; } static SEXP mkStringView(struct ArrowStringView* view) { if (view->data == NULL) { return R_NilValue; } SEXP chr = PROTECT(Rf_mkCharLenCE(view->data, (int)view->size_bytes, CE_UTF8)); SEXP str = PROTECT(Rf_allocVector(STRSXP, 1)); SET_STRING_ELT(str, 0, chr); UNPROTECT(2); return str; } SEXP nanoarrow_c_schema_parse(SEXP schema_xptr) { struct ArrowSchema* schema = nanoarrow_schema_from_xptr(schema_xptr); struct ArrowSchemaView schema_view; struct ArrowError error; int status = ArrowSchemaViewInit(&schema_view, schema, &error); if (status != NANOARROW_OK) { Rf_error("ArrowSchemaViewInit(): %s", ArrowErrorMessage(&error)); } const char* names[] = { "type", "storage_type", "extension_name", "extension_metadata", "fixed_size", "decimal_bitwidth", "decimal_precision", "decimal_scale", "time_unit", "timezone", "union_type_ids", ""}; SEXP result = PROTECT(Rf_mkNamed(VECSXP, names)); SET_VECTOR_ELT(result, 0, Rf_mkString(ArrowTypeString((schema_view.type)))); SET_VECTOR_ELT(result, 1, Rf_mkString(ArrowTypeString((schema_view.storage_type)))); if (schema_view.extension_name.data != NULL) { SET_VECTOR_ELT(result, 2, mkStringView(&schema_view.extension_name)); } if (schema_view.extension_metadata.data != NULL) { SEXP metadata_sexp = PROTECT(Rf_allocVector(RAWSXP, schema_view.extension_metadata.size_bytes)); memcpy(RAW(metadata_sexp), schema_view.extension_metadata.data, schema_view.extension_metadata.size_bytes); SET_VECTOR_ELT(result, 3, metadata_sexp); UNPROTECT(1); } if (schema_view.type == NANOARROW_TYPE_FIXED_SIZE_LIST 
|| schema_view.type == NANOARROW_TYPE_FIXED_SIZE_BINARY) { SET_VECTOR_ELT(result, 4, Rf_ScalarInteger(schema_view.fixed_size)); } if (schema_view.type == NANOARROW_TYPE_DECIMAL128 || schema_view.type == NANOARROW_TYPE_DECIMAL256) { SET_VECTOR_ELT(result, 5, Rf_ScalarInteger(schema_view.decimal_bitwidth)); SET_VECTOR_ELT(result, 6, Rf_ScalarInteger(schema_view.decimal_precision)); SET_VECTOR_ELT(result, 7, Rf_ScalarInteger(schema_view.decimal_scale)); } if (schema_view.type == NANOARROW_TYPE_TIME32 || schema_view.type == NANOARROW_TYPE_TIME64 || schema_view.type == NANOARROW_TYPE_TIMESTAMP || schema_view.type == NANOARROW_TYPE_DURATION) { SET_VECTOR_ELT(result, 8, Rf_mkString(ArrowTimeUnitString((schema_view.time_unit)))); } if (schema_view.type == NANOARROW_TYPE_TIMESTAMP) { SET_VECTOR_ELT(result, 9, Rf_mkString(schema_view.timezone)); } if (schema_view.type == NANOARROW_TYPE_DENSE_UNION || schema_view.type == NANOARROW_TYPE_SPARSE_UNION) { int8_t type_ids[128]; int num_type_ids = _ArrowParseUnionTypeIds(schema_view.union_type_ids, type_ids); if (num_type_ids == -1 || num_type_ids > 127) { Rf_error("Invalid type IDs in union type: '%s'", schema_view.union_type_ids); } SEXP union_type_ids = PROTECT(Rf_allocVector(INTSXP, num_type_ids)); for (int i = 0; i < num_type_ids; i++) { INTEGER(union_type_ids)[i] = type_ids[i]; } SET_VECTOR_ELT(result, 10, union_type_ids); UNPROTECT(1); } UNPROTECT(1); return result; } SEXP nanoarrow_c_schema_format(SEXP schema_xptr, SEXP recursive_sexp) { int recursive = LOGICAL(recursive_sexp)[0]; // Be extra safe here (errors during formatting are hard to work around) if (!Rf_inherits(schema_xptr, "nanoarrow_schema")) { return Rf_mkString("[invalid: schema is not a nanoarrow_schema]"); } if (TYPEOF(schema_xptr) != EXTPTRSXP) { return Rf_mkString("[invalid: schema is not an external pointer]"); } struct ArrowSchema* schema = (struct ArrowSchema*)R_ExternalPtrAddr(schema_xptr); int64_t size_needed = ArrowSchemaToString(schema, NULL, 0, 
recursive != 0); if (size_needed >= INT_MAX) { size_needed = INT_MAX - 1; } // Using an SEXP because Rf_mkCharLenCE could jump SEXP formatted_sexp = PROTECT(Rf_allocVector(RAWSXP, size_needed + 1)); ArrowSchemaToString(schema, (char*)RAW(formatted_sexp), size_needed + 1, recursive != 0); SEXP result_sexp = PROTECT(Rf_allocVector(STRSXP, 1)); SET_STRING_ELT(result_sexp, 0, Rf_mkCharLenCE((char*)RAW(formatted_sexp), (int)size_needed, CE_UTF8)); UNPROTECT(2); return result_sexp; } SEXP nanoarrow_c_schema_set_format(SEXP schema_mut_xptr, SEXP format_sexp) { struct ArrowSchema* schema = nanoarrow_schema_from_xptr(schema_mut_xptr); if (TYPEOF(format_sexp) != STRSXP || Rf_length(format_sexp) != 1) { Rf_error("schema$format must be character(1)"); } const char* format = Rf_translateCharUTF8(STRING_ELT(format_sexp, 0)); if (ArrowSchemaSetFormat(schema, format) != NANOARROW_OK) { Rf_error("Error setting schema$format"); } return R_NilValue; } SEXP nanoarrow_c_schema_set_name(SEXP schema_mut_xptr, SEXP name_sexp) { struct ArrowSchema* schema = nanoarrow_schema_from_xptr(schema_mut_xptr); int result; if (name_sexp == R_NilValue) { result = ArrowSchemaSetName(schema, NULL); } else { if (TYPEOF(name_sexp) != STRSXP || Rf_length(name_sexp) != 1) { Rf_error("schema$name must be NULL or character(1)"); } const char* name = Rf_translateCharUTF8(STRING_ELT(name_sexp, 0)); result = ArrowSchemaSetName(schema, name); } if (result != NANOARROW_OK) { Rf_error("Error setting schema$name"); } return R_NilValue; } static void finalize_buffer_xptr(SEXP buffer_xptr) { struct ArrowBuffer* buffer = (struct ArrowBuffer*)R_ExternalPtrAddr(buffer_xptr); if (buffer != NULL) { ArrowBufferReset(buffer); ArrowFree(buffer); } } static SEXP buffer_owning_xptr(void) { struct ArrowBuffer* buffer = (struct ArrowBuffer*)ArrowMalloc(sizeof(struct ArrowBuffer)); if (buffer == NULL) { Rf_error("Failed to allocate ArrowBuffer"); } SEXP buffer_xptr = PROTECT(R_MakeExternalPtr(buffer, R_NilValue, R_NilValue)); 
R_RegisterCFinalizer(buffer_xptr, &finalize_buffer_xptr); UNPROTECT(1); return buffer_xptr; } SEXP nanoarrow_c_schema_set_metadata(SEXP schema_mut_xptr, SEXP metadata_sexp) { struct ArrowSchema* schema = nanoarrow_schema_from_xptr(schema_mut_xptr); int result; if (Rf_xlength(metadata_sexp) == 0) { result = ArrowSchemaSetMetadata(schema, NULL); if (result != NANOARROW_OK) { Rf_error("Failed to set schema$metadata"); } return R_NilValue; } // We need this to ensure buffer gets cleaned up amongst the potential longjmp // possibilities below. SEXP buffer_xptr = PROTECT(buffer_owning_xptr()); struct ArrowBuffer* buffer = (struct ArrowBuffer*)R_ExternalPtrAddr(buffer_xptr); result = ArrowMetadataBuilderInit(buffer, NULL); if (result != NANOARROW_OK) { Rf_error("ArrowMetadataBuilderInit() failed"); } SEXP metadata_names = PROTECT(Rf_getAttrib(metadata_sexp, R_NamesSymbol)); if (metadata_names == R_NilValue) { Rf_error("schema$metadata must be named"); } struct ArrowStringView key; struct ArrowStringView value; for (R_xlen_t i = 0; i < Rf_xlength(metadata_sexp); i++) { SEXP name_sexp = STRING_ELT(metadata_names, i); if (name_sexp == NA_STRING) { Rf_error("schema$metadata[[%ld]] must be named", (long)i + 1); } const void* vmax = vmaxget(); key = ArrowCharView(Rf_translateCharUTF8(name_sexp)); if (key.size_bytes == 0) { Rf_error("schema$metadata[[%ld]] must be named", (long)i + 1); } SEXP value_sexp = VECTOR_ELT(metadata_sexp, i); if (TYPEOF(value_sexp) == STRSXP && Rf_xlength(value_sexp) == 1) { SEXP value_chr = STRING_ELT(value_sexp, 0); if (value_chr == NA_STRING) { Rf_error("schema$metadata[[%ld]] must not be NA_character_", (long)i + 1); } value = ArrowCharView(Rf_translateCharUTF8(value_chr)); } else if (TYPEOF(value_sexp) == RAWSXP) { value.data = (const char*)RAW(value_sexp); value.size_bytes = Rf_xlength(value_sexp); } else { Rf_error("schema$metadata[[%ld]] must be character(1) or raw()", (long)i + 1); } result = ArrowMetadataBuilderAppend(buffer, key, value); if 
(result != NANOARROW_OK) { Rf_error("ArrowMetadataBuilderAppend() failed"); } vmaxset(vmax); } UNPROTECT(1); result = ArrowSchemaSetMetadata(schema, (const char*)buffer->data); ArrowBufferReset(buffer); if (result != NANOARROW_OK) { Rf_error("ArrowSchemaSetMetadata() failed"); } UNPROTECT(1); return R_NilValue; } SEXP nanoarrow_c_schema_set_flags(SEXP schema_mut_xptr, SEXP flags_sexp) { struct ArrowSchema* schema = nanoarrow_schema_from_xptr(schema_mut_xptr); if (TYPEOF(flags_sexp) != INTSXP || Rf_length(flags_sexp) != 1) { Rf_error("schema$flags must be integer(1)"); } int flags = INTEGER(flags_sexp)[0]; schema->flags = flags; return R_NilValue; } static void release_all_children(struct ArrowSchema* schema) { for (int64_t i = 0; i < schema->n_children; i++) { if (schema->children[i]->release != NULL) { schema->children[i]->release(schema->children[i]); } } } static void free_all_children(struct ArrowSchema* schema) { for (int64_t i = 0; i < schema->n_children; i++) { if (schema->children[i] != NULL) { ArrowFree(schema->children[i]); schema->children[i] = NULL; } } if (schema->children != NULL) { ArrowFree(schema->children); schema->children = NULL; } schema->n_children = 0; } SEXP nanoarrow_c_schema_set_children(SEXP schema_mut_xptr, SEXP children_sexp) { struct ArrowSchema* schema = nanoarrow_schema_from_xptr(schema_mut_xptr); release_all_children(schema); if (Rf_xlength(children_sexp) == 0) { free_all_children(schema); return R_NilValue; } int result; if (Rf_xlength(children_sexp) != schema->n_children) { free_all_children(schema); result = ArrowSchemaAllocateChildren(schema, Rf_xlength(children_sexp)); if (result != NANOARROW_OK) { Rf_error("Error allocating schema$children of size %ld", (long)Rf_xlength(children_sexp)); } } // Names come from names(children) so that we can do // names(schema$children)[3] <- "something else" or // schema$children[[3]] <- some_unrelated_schema. 
On the flip // side, this makes schema$children[[3]]$name <- "something else" // have no effect, which is possibly confusing. SEXP children_names = PROTECT(Rf_getAttrib(children_sexp, R_NamesSymbol)); for (int64_t i = 0; i < schema->n_children; i++) { struct ArrowSchema* child = nanoarrow_schema_from_xptr(VECTOR_ELT(children_sexp, i)); result = ArrowSchemaDeepCopy(child, schema->children[i]); if (result != NANOARROW_OK) { Rf_error("Error copying new_values$children[[%ld]]", (long)i); } if (children_names != R_NilValue) { SEXP name_sexp = STRING_ELT(children_names, i); if (name_sexp == NA_STRING) { result = ArrowSchemaSetName(schema->children[i], ""); } else { const void* vmax = vmaxget(); const char* name = Rf_translateCharUTF8(name_sexp); result = ArrowSchemaSetName(schema->children[i], name); vmaxset(vmax); } } else { result = ArrowSchemaSetName(schema->children[i], ""); } if (result != NANOARROW_OK) { Rf_error("Error copying new_values$children[[%ld]]$name", (long)i); } } UNPROTECT(1); return R_NilValue; } SEXP nanoarrow_c_schema_set_dictionary(SEXP schema_mut_xptr, SEXP dictionary_xptr) { struct ArrowSchema* schema = nanoarrow_schema_from_xptr(schema_mut_xptr); // If there's already a dictionary, make sure we release it if (schema->dictionary != NULL) { if (schema->dictionary->release != NULL) { schema->dictionary->release(schema->dictionary); } } if (dictionary_xptr == R_NilValue) { if (schema->dictionary != NULL) { ArrowFree(schema->dictionary); schema->dictionary = NULL; } } else { int result; if (schema->dictionary == NULL) { result = ArrowSchemaAllocateDictionary(schema); if (result != NANOARROW_OK) { Rf_error("Error allocating schema$dictionary"); } } struct ArrowSchema* dictionary = nanoarrow_schema_from_xptr(dictionary_xptr); result = ArrowSchemaDeepCopy(dictionary, schema->dictionary); if (result != NANOARROW_OK) { Rf_error("Error copying schema$dictionary"); } } return R_NilValue; } 
nanoarrow/src/nanoarrow.h0000644000176200001440000042152314556775567015270 0ustar liggesusers// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #ifndef NANOARROW_BUILD_ID_H_INCLUDED #define NANOARROW_BUILD_ID_H_INCLUDED #define NANOARROW_VERSION_MAJOR 0 #define NANOARROW_VERSION_MINOR 4 #define NANOARROW_VERSION_PATCH 0 #define NANOARROW_VERSION "0.4.0" #define NANOARROW_VERSION_INT \ (NANOARROW_VERSION_MAJOR * 10000 + NANOARROW_VERSION_MINOR * 100 + \ NANOARROW_VERSION_PATCH) #define NANOARROW_NAMESPACE RPkg #endif // Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. 
See the License for the // specific language governing permissions and limitations // under the License. #ifndef NANOARROW_NANOARROW_TYPES_H_INCLUDED #define NANOARROW_NANOARROW_TYPES_H_INCLUDED #include #include #if defined(NANOARROW_DEBUG) && !defined(NANOARROW_PRINT_AND_DIE) #include #include #endif #ifdef __cplusplus extern "C" { #endif // Extra guard for versions of Arrow without the canonical guard #ifndef ARROW_FLAG_DICTIONARY_ORDERED /// \defgroup nanoarrow-arrow-cdata Arrow C Data interface /// /// The Arrow C Data (https://arrow.apache.org/docs/format/CDataInterface.html) /// and Arrow C Stream (https://arrow.apache.org/docs/format/CStreamInterface.html) /// interfaces are part of the /// Arrow Columnar Format specification /// (https://arrow.apache.org/docs/format/Columnar.html). See the Arrow documentation for /// documentation of these structures. /// /// @{ #ifndef ARROW_C_DATA_INTERFACE #define ARROW_C_DATA_INTERFACE #define ARROW_FLAG_DICTIONARY_ORDERED 1 #define ARROW_FLAG_NULLABLE 2 #define ARROW_FLAG_MAP_KEYS_SORTED 4 struct ArrowSchema { // Array type description const char* format; const char* name; const char* metadata; int64_t flags; int64_t n_children; struct ArrowSchema** children; struct ArrowSchema* dictionary; // Release callback void (*release)(struct ArrowSchema*); // Opaque producer-specific data void* private_data; }; struct ArrowArray { // Array data description int64_t length; int64_t null_count; int64_t offset; int64_t n_buffers; int64_t n_children; const void** buffers; struct ArrowArray** children; struct ArrowArray* dictionary; // Release callback void (*release)(struct ArrowArray*); // Opaque producer-specific data void* private_data; }; #endif // ARROW_C_DATA_INTERFACE #ifndef ARROW_C_STREAM_INTERFACE #define ARROW_C_STREAM_INTERFACE struct ArrowArrayStream { // Callback to get the stream type // (will be the same for all arrays in the stream). // // Return value: 0 if successful, an `errno`-compatible error code otherwise. 
// // If successful, the ArrowSchema must be released independently from the stream. int (*get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out); // Callback to get the next array // (if no error and the array is released, the stream has ended) // // Return value: 0 if successful, an `errno`-compatible error code otherwise. // // If successful, the ArrowArray must be released independently from the stream. int (*get_next)(struct ArrowArrayStream*, struct ArrowArray* out); // Callback to get optional detailed error information. // This must only be called if the last stream operation failed // with a non-0 return code. // // Return value: pointer to a null-terminated character array describing // the last error, or NULL if no description is available. // // The returned pointer is only valid until the next operation on this stream // (including release). const char* (*get_last_error)(struct ArrowArrayStream*); // Release callback: release the stream's own resources. // Note that arrays returned by `get_next` must be individually released. void (*release)(struct ArrowArrayStream*); // Opaque producer-specific data void* private_data; }; #endif // ARROW_C_STREAM_INTERFACE #endif // ARROW_FLAG_DICTIONARY_ORDERED /// @} // Utility macros #define _NANOARROW_CONCAT(x, y) x##y #define _NANOARROW_MAKE_NAME(x, y) _NANOARROW_CONCAT(x, y) #define _NANOARROW_RETURN_NOT_OK_IMPL(NAME, EXPR) \ do { \ const int NAME = (EXPR); \ if (NAME) return NAME; \ } while (0) #define _NANOARROW_CHECK_RANGE(x_, min_, max_) \ NANOARROW_RETURN_NOT_OK((x_ >= min_ && x_ <= max_) ? NANOARROW_OK : EINVAL) #define _NANOARROW_CHECK_UPPER_LIMIT(x_, max_) \ NANOARROW_RETURN_NOT_OK((x_ <= max_) ? 
NANOARROW_OK : EINVAL) #if defined(NANOARROW_DEBUG) #define _NANOARROW_RETURN_NOT_OK_WITH_ERROR_IMPL(NAME, EXPR, ERROR_PTR_EXPR, EXPR_STR) \ do { \ const int NAME = (EXPR); \ if (NAME) { \ ArrowErrorSet((ERROR_PTR_EXPR), "%s failed with errno %d\n* %s:%d", EXPR_STR, \ NAME, __FILE__, __LINE__); \ return NAME; \ } \ } while (0) #else #define _NANOARROW_RETURN_NOT_OK_WITH_ERROR_IMPL(NAME, EXPR, ERROR_PTR_EXPR, EXPR_STR) \ do { \ const int NAME = (EXPR); \ if (NAME) { \ ArrowErrorSet((ERROR_PTR_EXPR), "%s failed with errno %d", EXPR_STR, NAME); \ return NAME; \ } \ } while (0) #endif #if defined(NANOARROW_DEBUG) // For checking ArrowErrorSet() calls for valid printf format strings/arguments // If using mingw's c99-compliant printf, we need a different format-checking attribute #if defined(__USE_MINGW_ANSI_STDIO) && defined(__MINGW_PRINTF_FORMAT) #define NANOARROW_CHECK_PRINTF_ATTRIBUTE \ __attribute__((format(__MINGW_PRINTF_FORMAT, 2, 3))) #elif defined(__GNUC__) #define NANOARROW_CHECK_PRINTF_ATTRIBUTE __attribute__((format(printf, 2, 3))) #else #define NANOARROW_CHECK_PRINTF_ATTRIBUTE #endif // For checking calls to functions that return ArrowErrorCode #if defined(__GNUC__) && (__GNUC__ >= 4) #define NANOARROW_CHECK_RETURN_ATTRIBUTE __attribute__((warn_unused_result)) #elif defined(_MSC_VER) && (_MSC_VER >= 1700) #define NANOARROW_CHECK_RETURN_ATTRIBUTE _Check_return_ #else #define NANOARROW_CHECK_RETURN_ATTRIBUTE #endif #else #define NANOARROW_CHECK_RETURN_ATTRIBUTE #define NANOARROW_CHECK_PRINTF_ATTRIBUTE #endif #define NANOARROW_UNUSED(x) (void)(x) /// \brief Return code for success. 
/// \ingroup nanoarrow-errors #define NANOARROW_OK 0 /// \brief Represents an errno-compatible error code /// \ingroup nanoarrow-errors typedef int ArrowErrorCode; #if defined(NANOARROW_DEBUG) #define ArrowErrorCode NANOARROW_CHECK_RETURN_ATTRIBUTE ArrowErrorCode #endif /// \brief Flags supported by ArrowSchemaViewInit() /// \ingroup nanoarrow-schema-view #define NANOARROW_FLAG_ALL_SUPPORTED \ (ARROW_FLAG_DICTIONARY_ORDERED | ARROW_FLAG_NULLABLE | ARROW_FLAG_MAP_KEYS_SORTED) /// \brief Error type containing a UTF-8 encoded message. /// \ingroup nanoarrow-errors struct ArrowError { /// \brief A character buffer with space for an error message. char message[1024]; }; /// \brief Ensure an ArrowError is null-terminated by zeroing the first character. /// \ingroup nanoarrow-errors /// /// If error is NULL, this function does nothing. static inline void ArrowErrorInit(struct ArrowError* error) { if (error != NULL) { error->message[0] = '\0'; } } /// \brief Get the contents of an error /// \ingroup nanoarrow-errors /// /// If error is NULL, returns "", or returns the contents of the error message /// otherwise. static inline const char* ArrowErrorMessage(struct ArrowError* error) { if (error == NULL) { return ""; } else { return error->message; } } /// \brief Set the contents of an error from an existing null-terminated string /// \ingroup nanoarrow-errors /// /// If error is NULL, this function does nothing. 
static inline void ArrowErrorSetString(struct ArrowError* error, const char* src) { if (error == NULL) { return; } int64_t src_len = strlen(src); if (src_len >= ((int64_t)sizeof(error->message))) { memcpy(error->message, src, sizeof(error->message) - 1); error->message[sizeof(error->message) - 1] = '\0'; } else { memcpy(error->message, src, src_len); error->message[src_len] = '\0'; } } /// \brief Check the result of an expression and return it if not NANOARROW_OK /// \ingroup nanoarrow-errors #define NANOARROW_RETURN_NOT_OK(EXPR) \ _NANOARROW_RETURN_NOT_OK_IMPL(_NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), EXPR) /// \brief Check the result of an expression and return it if not NANOARROW_OK, /// adding an auto-generated message to an ArrowError. /// \ingroup nanoarrow-errors /// /// This macro is used to ensure that functions that accept an ArrowError /// as input always set its message when returning an error code (e.g., when calling /// a nanoarrow function that does *not* accept ArrowError). #define NANOARROW_RETURN_NOT_OK_WITH_ERROR(EXPR, ERROR_EXPR) \ _NANOARROW_RETURN_NOT_OK_WITH_ERROR_IMPL( \ _NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), EXPR, ERROR_EXPR, #EXPR) #if defined(NANOARROW_DEBUG) && !defined(NANOARROW_PRINT_AND_DIE) #define NANOARROW_PRINT_AND_DIE(VALUE, EXPR_STR) \ do { \ fprintf(stderr, "%s failed with code %d\n* %s:%d\n", EXPR_STR, (int)(VALUE), \ __FILE__, (int)__LINE__); \ abort(); \ } while (0) #endif #if defined(NANOARROW_DEBUG) #define _NANOARROW_ASSERT_OK_IMPL(NAME, EXPR, EXPR_STR) \ do { \ const int NAME = (EXPR); \ if (NAME) NANOARROW_PRINT_AND_DIE(NAME, EXPR_STR); \ } while (0) /// \brief Assert that an expression's value is NANOARROW_OK /// \ingroup nanoarrow-errors /// /// If nanoarrow was built in debug mode (i.e., defined(NANOARROW_DEBUG) is true), /// print a message to stderr and abort. If nanoarrow was built in release mode, /// this statement has no effect. 
You can customize fatal error behaviour /// be defining the NANOARROW_PRINT_AND_DIE macro before including nanoarrow.h /// This macro is provided as a convenience for users and is not used internally. #define NANOARROW_ASSERT_OK(EXPR) \ _NANOARROW_ASSERT_OK_IMPL(_NANOARROW_MAKE_NAME(errno_status_, __COUNTER__), EXPR, #EXPR) #define _NANOARROW_DCHECK_IMPL(EXPR, EXPR_STR) \ do { \ if (!(EXPR)) NANOARROW_PRINT_AND_DIE(-1, EXPR_STR); \ } while (0) #define NANOARROW_DCHECK(EXPR) _NANOARROW_DCHECK_IMPL(EXPR, #EXPR) #else #define NANOARROW_ASSERT_OK(EXPR) EXPR #define NANOARROW_DCHECK(EXPR) #endif static inline void ArrowSchemaMove(struct ArrowSchema* src, struct ArrowSchema* dst) { NANOARROW_DCHECK(src != NULL); NANOARROW_DCHECK(dst != NULL); memcpy(dst, src, sizeof(struct ArrowSchema)); src->release = NULL; } static inline void ArrowSchemaRelease(struct ArrowSchema* schema) { NANOARROW_DCHECK(schema != NULL); schema->release(schema); NANOARROW_DCHECK(schema->release == NULL); } static inline void ArrowArrayMove(struct ArrowArray* src, struct ArrowArray* dst) { NANOARROW_DCHECK(src != NULL); NANOARROW_DCHECK(dst != NULL); memcpy(dst, src, sizeof(struct ArrowArray)); src->release = NULL; } static inline void ArrowArrayRelease(struct ArrowArray* array) { NANOARROW_DCHECK(array != NULL); array->release(array); NANOARROW_DCHECK(array->release == NULL); } static inline void ArrowArrayStreamMove(struct ArrowArrayStream* src, struct ArrowArrayStream* dst) { NANOARROW_DCHECK(src != NULL); NANOARROW_DCHECK(dst != NULL); memcpy(dst, src, sizeof(struct ArrowArrayStream)); src->release = NULL; } static inline const char* ArrowArrayStreamGetLastError( struct ArrowArrayStream* array_stream) { NANOARROW_DCHECK(array_stream != NULL); const char* value = array_stream->get_last_error(array_stream); if (value == NULL) { return ""; } else { return value; } } static inline ArrowErrorCode ArrowArrayStreamGetSchema( struct ArrowArrayStream* array_stream, struct ArrowSchema* out, struct 
ArrowError* error) { NANOARROW_DCHECK(array_stream != NULL); int result = array_stream->get_schema(array_stream, out); if (result != NANOARROW_OK && error != NULL) { ArrowErrorSetString(error, ArrowArrayStreamGetLastError(array_stream)); } return result; } static inline ArrowErrorCode ArrowArrayStreamGetNext( struct ArrowArrayStream* array_stream, struct ArrowArray* out, struct ArrowError* error) { NANOARROW_DCHECK(array_stream != NULL); int result = array_stream->get_next(array_stream, out); if (result != NANOARROW_OK && error != NULL) { ArrowErrorSetString(error, ArrowArrayStreamGetLastError(array_stream)); } return result; } static inline void ArrowArrayStreamRelease(struct ArrowArrayStream* array_stream) { NANOARROW_DCHECK(array_stream != NULL); array_stream->release(array_stream); NANOARROW_DCHECK(array_stream->release == NULL); } static char _ArrowIsLittleEndian(void) { uint32_t check = 1; char first_byte; memcpy(&first_byte, &check, sizeof(char)); return first_byte; } /// \brief Arrow type enumerator /// \ingroup nanoarrow-utils /// /// These names are intended to map to the corresponding arrow::Type::type /// enumerator; however, the numeric values are specifically not equal /// (i.e., do not rely on numeric comparison). 
enum ArrowType {
  NANOARROW_TYPE_UNINITIALIZED = 0,
  NANOARROW_TYPE_NA = 1,
  NANOARROW_TYPE_BOOL,
  NANOARROW_TYPE_UINT8,
  NANOARROW_TYPE_INT8,
  NANOARROW_TYPE_UINT16,
  NANOARROW_TYPE_INT16,
  NANOARROW_TYPE_UINT32,
  NANOARROW_TYPE_INT32,
  NANOARROW_TYPE_UINT64,
  NANOARROW_TYPE_INT64,
  NANOARROW_TYPE_HALF_FLOAT,
  NANOARROW_TYPE_FLOAT,
  NANOARROW_TYPE_DOUBLE,
  NANOARROW_TYPE_STRING,
  NANOARROW_TYPE_BINARY,
  NANOARROW_TYPE_FIXED_SIZE_BINARY,
  NANOARROW_TYPE_DATE32,
  NANOARROW_TYPE_DATE64,
  NANOARROW_TYPE_TIMESTAMP,
  NANOARROW_TYPE_TIME32,
  NANOARROW_TYPE_TIME64,
  NANOARROW_TYPE_INTERVAL_MONTHS,
  NANOARROW_TYPE_INTERVAL_DAY_TIME,
  NANOARROW_TYPE_DECIMAL128,
  NANOARROW_TYPE_DECIMAL256,
  NANOARROW_TYPE_LIST,
  NANOARROW_TYPE_STRUCT,
  NANOARROW_TYPE_SPARSE_UNION,
  NANOARROW_TYPE_DENSE_UNION,
  NANOARROW_TYPE_DICTIONARY,
  NANOARROW_TYPE_MAP,
  NANOARROW_TYPE_EXTENSION,
  NANOARROW_TYPE_FIXED_SIZE_LIST,
  NANOARROW_TYPE_DURATION,
  NANOARROW_TYPE_LARGE_STRING,
  NANOARROW_TYPE_LARGE_BINARY,
  NANOARROW_TYPE_LARGE_LIST,
  NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO
};

/// \brief Get a string value of an enum ArrowType value
/// \ingroup nanoarrow-utils
///
/// Returns NULL for values of type without a name below; note that this
/// includes NANOARROW_TYPE_UNINITIALIZED.
static inline const char* ArrowTypeString(enum ArrowType type);

static inline const char* ArrowTypeString(enum ArrowType type) {
  switch (type) {
    case NANOARROW_TYPE_NA:
      return "na";
    case NANOARROW_TYPE_BOOL:
      return "bool";
    case NANOARROW_TYPE_UINT8:
      return "uint8";
    case NANOARROW_TYPE_INT8:
      return "int8";
    case NANOARROW_TYPE_UINT16:
      return "uint16";
    case NANOARROW_TYPE_INT16:
      return "int16";
    case NANOARROW_TYPE_UINT32:
      return "uint32";
    case NANOARROW_TYPE_INT32:
      return "int32";
    case NANOARROW_TYPE_UINT64:
      return "uint64";
    case NANOARROW_TYPE_INT64:
      return "int64";
    case NANOARROW_TYPE_HALF_FLOAT:
      return "half_float";
    case NANOARROW_TYPE_FLOAT:
      return "float";
    case NANOARROW_TYPE_DOUBLE:
      return "double";
    case NANOARROW_TYPE_STRING:
      return "string";
    case NANOARROW_TYPE_BINARY:
      return "binary";
    case NANOARROW_TYPE_FIXED_SIZE_BINARY:
      return "fixed_size_binary";
    case NANOARROW_TYPE_DATE32:
      return "date32";
    case NANOARROW_TYPE_DATE64:
      return "date64";
    case NANOARROW_TYPE_TIMESTAMP:
      return "timestamp";
    case NANOARROW_TYPE_TIME32:
      return "time32";
    case NANOARROW_TYPE_TIME64:
      return "time64";
    case NANOARROW_TYPE_INTERVAL_MONTHS:
      return "interval_months";
    case NANOARROW_TYPE_INTERVAL_DAY_TIME:
      return "interval_day_time";
    case NANOARROW_TYPE_DECIMAL128:
      return "decimal128";
    case NANOARROW_TYPE_DECIMAL256:
      return "decimal256";
    case NANOARROW_TYPE_LIST:
      return "list";
    case NANOARROW_TYPE_STRUCT:
      return "struct";
    case NANOARROW_TYPE_SPARSE_UNION:
      return "sparse_union";
    case NANOARROW_TYPE_DENSE_UNION:
      return "dense_union";
    case NANOARROW_TYPE_DICTIONARY:
      return "dictionary";
    case NANOARROW_TYPE_MAP:
      return "map";
    case NANOARROW_TYPE_EXTENSION:
      return "extension";
    case NANOARROW_TYPE_FIXED_SIZE_LIST:
      return "fixed_size_list";
    case NANOARROW_TYPE_DURATION:
      return "duration";
    case NANOARROW_TYPE_LARGE_STRING:
      return "large_string";
    case NANOARROW_TYPE_LARGE_BINARY:
      return "large_binary";
    case NANOARROW_TYPE_LARGE_LIST:
      return "large_list";
    case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO:
      return "interval_month_day_nano";
    default:
      return NULL;
  }
}

/// \brief Arrow time unit enumerator
/// \ingroup nanoarrow-utils
///
/// These names and values map to the corresponding arrow::TimeUnit::type
/// enumerator.
enum ArrowTimeUnit {
  NANOARROW_TIME_UNIT_SECOND = 0,
  NANOARROW_TIME_UNIT_MILLI = 1,
  NANOARROW_TIME_UNIT_MICRO = 2,
  NANOARROW_TIME_UNIT_NANO = 3
};

/// \brief Validation level enumerator
/// \ingroup nanoarrow-array
enum ArrowValidationLevel {
  /// \brief Do not validate buffer sizes or content.
  NANOARROW_VALIDATION_LEVEL_NONE = 0,

  /// \brief Validate buffer sizes that depend on array length but do not validate buffer
  /// sizes that depend on buffer data access.
  NANOARROW_VALIDATION_LEVEL_MINIMAL = 1,

  /// \brief Validate all buffer sizes, including those that require buffer data access,
  /// but do not perform any checks whose cost scales with the length of the buffers.
  NANOARROW_VALIDATION_LEVEL_DEFAULT = 2,

  /// \brief Validate all buffer sizes and all buffer content. This is useful in the
  /// context of untrusted input or input that may have been corrupted in transit.
  NANOARROW_VALIDATION_LEVEL_FULL = 3
};

/// \brief Get a string value of an enum ArrowTimeUnit value
/// \ingroup nanoarrow-utils
///
/// Returns NULL for invalid values for time_unit
static inline const char* ArrowTimeUnitString(enum ArrowTimeUnit time_unit);

static inline const char* ArrowTimeUnitString(enum ArrowTimeUnit time_unit) {
  switch (time_unit) {
    case NANOARROW_TIME_UNIT_SECOND:
      return "s";
    case NANOARROW_TIME_UNIT_MILLI:
      return "ms";
    case NANOARROW_TIME_UNIT_MICRO:
      return "us";
    case NANOARROW_TIME_UNIT_NANO:
      return "ns";
    default:
      return NULL;
  }
}

/// \brief Functional types of buffers as described in the Arrow Columnar Specification
/// \ingroup nanoarrow-array-view
enum ArrowBufferType {
  NANOARROW_BUFFER_TYPE_NONE,
  NANOARROW_BUFFER_TYPE_VALIDITY,
  NANOARROW_BUFFER_TYPE_TYPE_ID,
  NANOARROW_BUFFER_TYPE_UNION_OFFSET,
  NANOARROW_BUFFER_TYPE_DATA_OFFSET,
  NANOARROW_BUFFER_TYPE_DATA
};

/// \brief The maximum number of buffers in an ArrowArrayView or ArrowLayout
/// \ingroup nanoarrow-array-view
///
/// All currently supported types have 3 buffers or fewer; however, future types
/// may involve a variable number of buffers (e.g., string view). These buffers
/// will be represented by separate members of the ArrowArrayView or ArrowLayout.
#define NANOARROW_MAX_FIXED_BUFFERS 3

/// \brief A non-owning view of a string
/// \ingroup nanoarrow-utils
struct ArrowStringView {
  /// \brief A pointer to the start of the string
  ///
  /// If size_bytes is 0, this value may be NULL.
  const char* data;

  /// \brief The size of the string in bytes,
  ///
  /// (Not including the null terminator.)
  int64_t size_bytes;
};

/// \brief Return a view of a const C string
/// \ingroup nanoarrow-utils
///
/// A NULL value yields a view with NULL data and a size of 0; otherwise the
/// size is strlen(value). The view borrows value and does not copy it.
static inline struct ArrowStringView ArrowCharView(const char* value);

static inline struct ArrowStringView ArrowCharView(const char* value) {
  struct ArrowStringView out;

  out.data = value;
  if (value) {
    out.size_bytes = (int64_t)strlen(value);
  } else {
    out.size_bytes = 0;
  }

  return out;
}

// The same buffer address exposed through differently typed pointers so that
// callers can index it without casting.
union ArrowBufferViewData {
  const void* data;
  const int8_t* as_int8;
  const uint8_t* as_uint8;
  const int16_t* as_int16;
  const uint16_t* as_uint16;
  const int32_t* as_int32;
  const uint32_t* as_uint32;
  const int64_t* as_int64;
  const uint64_t* as_uint64;
  const double* as_double;
  const float* as_float;
  const char* as_char;
};

/// \brief A non-owning view of a buffer
/// \ingroup nanoarrow-utils
struct ArrowBufferView {
  /// \brief A pointer to the start of the buffer
  ///
  /// If size_bytes is 0, this value may be NULL.
  union ArrowBufferViewData data;

  /// \brief The size of the buffer in bytes
  int64_t size_bytes;
};

/// \brief Array buffer allocation and deallocation
/// \ingroup nanoarrow-buffer
///
/// Container for allocate, reallocate, and free methods that can be used
/// to customize allocation and deallocation of buffers when constructing
/// an ArrowArray.
struct ArrowBufferAllocator {
  /// \brief Reallocate a buffer or return NULL if it cannot be reallocated
  uint8_t* (*reallocate)(struct ArrowBufferAllocator* allocator, uint8_t* ptr,
                         int64_t old_size, int64_t new_size);

  /// \brief Deallocate a buffer allocated by this allocator
  void (*free)(struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t size);

  /// \brief Opaque data specific to the allocator
  void* private_data;
};

/// \brief An owning mutable view of a buffer
/// \ingroup nanoarrow-buffer
struct ArrowBuffer {
  /// \brief A pointer to the start of the buffer
  ///
  /// If capacity_bytes is 0, this value may be NULL.
  uint8_t* data;

  /// \brief The size of the buffer in bytes
  int64_t size_bytes;

  /// \brief The capacity of the buffer in bytes
  int64_t capacity_bytes;

  /// \brief The allocator that will be used to reallocate and/or free the buffer
  struct ArrowBufferAllocator allocator;
};

/// \brief An owning mutable view of a bitmap
/// \ingroup nanoarrow-bitmap
struct ArrowBitmap {
  /// \brief An ArrowBuffer to hold the allocated memory
  struct ArrowBuffer buffer;

  /// \brief The number of bits that have been appended to the bitmap
  int64_t size_bits;
};

/// \brief A description of an arrangement of buffers
/// \ingroup nanoarrow-utils
///
/// Contains the minimum amount of information required to
/// calculate the size of each buffer in an ArrowArray knowing only
/// the length and offset of the array.
struct ArrowLayout {
  /// \brief The function of each buffer
  enum ArrowBufferType buffer_type[NANOARROW_MAX_FIXED_BUFFERS];

  /// \brief The data type of each buffer
  enum ArrowType buffer_data_type[NANOARROW_MAX_FIXED_BUFFERS];

  /// \brief The size of an element in each buffer or 0 if this size is variable or
  /// unknown
  int64_t element_size_bits[NANOARROW_MAX_FIXED_BUFFERS];

  /// \brief The number of elements in the child array per element in this array for a
  /// fixed-size list
  int64_t child_size_elements;
};

/// \brief A non-owning view of an ArrowArray
/// \ingroup nanoarrow-array-view
///
/// This data structure provides access to the values contained within
/// an ArrowArray with fields provided in a more readily-extractible
/// form. You can re-use an ArrowArrayView for multiple ArrowArrays
/// with the same storage type, use it to represent a hypothetical
/// ArrowArray that does not exist yet, or use it to validate the buffers
/// of a future ArrowArray.
struct ArrowArrayView {
  /// \brief The underlying ArrowArray or NULL if it has not been set or
  /// if the buffers in this ArrowArrayView are not backed by an ArrowArray.
  const struct ArrowArray* array;

  /// \brief The number of elements from the physical start of the buffers.
  int64_t offset;

  /// \brief The number of elements in this view.
  int64_t length;

  /// \brief A cached null count or -1 to indicate that this value is unknown.
  int64_t null_count;

  /// \brief The type used to store values in this array
  ///
  /// This type represents only the minimum required information to
  /// extract values from the array buffers (e.g., for a Date32 array,
  /// this value will be NANOARROW_TYPE_INT32). For dictionary-encoded
  /// arrays, this will be the index type.
  enum ArrowType storage_type;

  /// \brief The buffer types, strides, and sizes of this Array's buffers
  struct ArrowLayout layout;

  /// \brief This Array's buffers as ArrowBufferView objects
  struct ArrowBufferView buffer_views[NANOARROW_MAX_FIXED_BUFFERS];

  /// \brief The number of children of this view
  int64_t n_children;

  /// \brief Pointers to views of this array's children
  struct ArrowArrayView** children;

  /// \brief Pointer to a view of this array's dictionary
  struct ArrowArrayView* dictionary;

  /// \brief Union type id to child index mapping
  ///
  /// If storage_type is a union type, a 256-byte ArrowMalloc()ed buffer
  /// such that child_index == union_type_id_map[type_id] and
  /// type_id == union_type_id_map[128 + child_index]. This value may be
  /// NULL in the case where child_id == type_id.
  int8_t* union_type_id_map;
};

// Used as the private data member for ArrowArrays allocated here and accessed
// internally within inline ArrowArray* helpers.
struct ArrowArrayPrivateData {
  // Holder for the validity buffer (or first buffer for union types, which are
  // the only type whose first buffer is not a validity buffer)
  struct ArrowBitmap bitmap;

  // Holder for additional buffers as required
  struct ArrowBuffer buffers[NANOARROW_MAX_FIXED_BUFFERS - 1];

  // The array of pointers to buffers. This must be updated after a sequence
  // of appends to synchronize its values with the actual buffer addresses
  // (which may have been reallocated during that time)
  const void* buffer_data[NANOARROW_MAX_FIXED_BUFFERS];

  // The storage data type, or NANOARROW_TYPE_UNINITIALIZED if unknown
  enum ArrowType storage_type;

  // The buffer arrangement for the storage type
  struct ArrowLayout layout;

  // Flag to indicate if there are non-sequence union type ids.
  // In the future this could be replaced with a type id<->child mapping
  // to support constructing unions in append mode where type_id != child_index
  int8_t union_type_id_is_child_index;
};

/// \brief A representation of an interval.
/// \ingroup nanoarrow-utils
struct ArrowInterval {
  /// \brief The type of interval being used
  enum ArrowType type;
  /// \brief The number of months represented by the interval
  int32_t months;
  /// \brief The number of days represented by the interval
  int32_t days;
  /// \brief The number of ms represented by the interval
  int32_t ms;
  /// \brief The number of ns represented by the interval
  int64_t ns;
};

/// \brief Zero initialize an Interval with a given unit
/// \ingroup nanoarrow-utils
static inline void ArrowIntervalInit(struct ArrowInterval* interval,
                                     enum ArrowType type) {
  memset(interval, 0, sizeof(struct ArrowInterval));
  interval->type = type;
}

/// \brief A representation of a fixed-precision decimal number
/// \ingroup nanoarrow-utils
///
/// This structure should be initialized with ArrowDecimalInit() once and
/// values set using ArrowDecimalSetInt(), ArrowDecimalSetBytes128(),
/// or ArrowDecimalSetBytes256().
struct ArrowDecimal {
  /// \brief An array of 64-bit integers of n_words length defined in native-endian order
  uint64_t words[4];

  /// \brief The number of significant digits this decimal number can represent
  int32_t precision;

  /// \brief The number of digits after the decimal point. This can be negative.
  int32_t scale;

  /// \brief The number of words in the words array
  int n_words;

  /// \brief Cached value used by the implementation
  int high_word_index;

  /// \brief Cached value used by the implementation
  int low_word_index;
};

/// \brief Initialize a decimal with a given set of type parameters
/// \ingroup nanoarrow-utils
///
/// All four words are zeroed, n_words is set to bitwidth / 64, and the cached
/// low/high word indices are chosen according to the byte order of the machine.
static inline void ArrowDecimalInit(struct ArrowDecimal* decimal, int32_t bitwidth,
                                    int32_t precision, int32_t scale) {
  memset(decimal->words, 0, sizeof(decimal->words));
  decimal->precision = precision;
  decimal->scale = scale;
  decimal->n_words = bitwidth / 8 / sizeof(uint64_t);

  // The least significant word is stored first on little-endian machines and
  // last on big-endian machines.
  if (_ArrowIsLittleEndian()) {
    decimal->low_word_index = 0;
    decimal->high_word_index = decimal->n_words - 1;
  } else {
    decimal->low_word_index = decimal->n_words - 1;
    decimal->high_word_index = 0;
  }
}

/// \brief Get a signed integer value of a sufficiently small ArrowDecimal
/// \ingroup nanoarrow-utils
///
/// This does not check if the decimal's precision is sufficiently small to fit
/// within the signed 64-bit integer range (A precision less than or equal
/// to 18 is sufficiently small).
static inline int64_t ArrowDecimalGetIntUnsafe(const struct ArrowDecimal* decimal) { return (int64_t)decimal->words[decimal->low_word_index]; } /// \brief Copy the bytes of this decimal into a sufficiently large buffer /// \ingroup nanoarrow-utils static inline void ArrowDecimalGetBytes(const struct ArrowDecimal* decimal, uint8_t* out) { memcpy(out, decimal->words, decimal->n_words * sizeof(uint64_t)); } /// \brief Returns 1 if the value represented by decimal is >= 0 or -1 otherwise /// \ingroup nanoarrow-utils static inline int64_t ArrowDecimalSign(const struct ArrowDecimal* decimal) { return 1 | ((int64_t)(decimal->words[decimal->high_word_index]) >> 63); } /// \brief Sets the integer value of this decimal /// \ingroup nanoarrow-utils static inline void ArrowDecimalSetInt(struct ArrowDecimal* decimal, int64_t value) { if (value < 0) { memset(decimal->words, 0xff, decimal->n_words * sizeof(uint64_t)); } else { memset(decimal->words, 0, decimal->n_words * sizeof(uint64_t)); } decimal->words[decimal->low_word_index] = value; } /// \brief Negate the value of this decimal in place /// \ingroup nanoarrow-utils static inline void ArrowDecimalNegate(struct ArrowDecimal* decimal) { uint64_t carry = 1; if (decimal->low_word_index == 0) { for (int i = 0; i < decimal->n_words; i++) { uint64_t elem = decimal->words[i]; elem = ~elem + carry; carry &= (elem == 0); decimal->words[i] = elem; } } else { for (int i = decimal->low_word_index; i >= 0; i--) { uint64_t elem = decimal->words[i]; elem = ~elem + carry; carry &= (elem == 0); decimal->words[i] = elem; } } } /// \brief Copy bytes from a buffer into this decimal /// \ingroup nanoarrow-utils static inline void ArrowDecimalSetBytes(struct ArrowDecimal* decimal, const uint8_t* value) { memcpy(decimal->words, value, decimal->n_words * sizeof(uint64_t)); } #ifdef __cplusplus } #endif #endif // Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. 
See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #ifndef NANOARROW_H_INCLUDED #define NANOARROW_H_INCLUDED #include #include #include // If using CMake, optionally pass -DNANOARROW_NAMESPACE=MyNamespace which will set this // define in nanoarrow_config.h. If not, you can optionally #define NANOARROW_NAMESPACE // MyNamespace here. // This section remaps the non-prefixed symbols to the prefixed symbols so that // code written against this build can be used independent of the value of // NANOARROW_NAMESPACE. 
// When NANOARROW_NAMESPACE is defined, every exported (non-inline) symbol is
// remapped to NANOARROW_NAMESPACE##Symbol. Each symbol is listed exactly once.
#ifdef NANOARROW_NAMESPACE
#define NANOARROW_CAT(A, B) A##B
#define NANOARROW_SYMBOL(A, B) NANOARROW_CAT(A, B)

#define ArrowNanoarrowVersion NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowNanoarrowVersion)
#define ArrowNanoarrowVersionInt \
  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowNanoarrowVersionInt)
#define ArrowMalloc NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMalloc)
#define ArrowRealloc NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowRealloc)
#define ArrowFree NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowFree)
#define ArrowBufferAllocatorDefault \
  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBufferAllocatorDefault)
#define ArrowBufferDeallocator \
  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBufferDeallocator)
#define ArrowErrorSet NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowErrorSet)
#define ArrowLayoutInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowLayoutInit)
#define ArrowDecimalSetDigits NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDecimalSetDigits)
#define ArrowDecimalAppendDigitsToBuffer \
  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDecimalAppendDigitsToBuffer)
#define ArrowSchemaInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaInit)
#define ArrowSchemaInitFromType \
  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaInitFromType)
#define ArrowSchemaSetType NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetType)
#define ArrowSchemaSetTypeStruct \
  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeStruct)
#define ArrowSchemaSetTypeFixedSize \
  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeFixedSize)
#define ArrowSchemaSetTypeDecimal \
  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeDecimal)
#define ArrowSchemaSetTypeDateTime \
  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeDateTime)
#define ArrowSchemaSetTypeUnion \
  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeUnion)
#define ArrowSchemaDeepCopy NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaDeepCopy)
#define ArrowSchemaSetFormat NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetFormat)
#define ArrowSchemaSetName NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetName)
#define ArrowSchemaSetMetadata \
  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetMetadata)
#define ArrowSchemaAllocateChildren \
  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaAllocateChildren)
#define ArrowSchemaAllocateDictionary \
  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaAllocateDictionary)
#define ArrowMetadataReaderInit \
  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataReaderInit)
#define ArrowMetadataReaderRead \
  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataReaderRead)
#define ArrowMetadataSizeOf NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataSizeOf)
#define ArrowMetadataHasKey NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataHasKey)
#define ArrowMetadataGetValue NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataGetValue)
#define ArrowMetadataBuilderInit \
  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderInit)
#define ArrowMetadataBuilderAppend \
  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderAppend)
#define ArrowMetadataBuilderSet \
  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderSet)
#define ArrowMetadataBuilderRemove \
  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowMetadataBuilderRemove)
#define ArrowSchemaViewInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaViewInit)
#define ArrowSchemaToString NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaToString)
#define ArrowArrayInitFromType \
  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromType)
#define ArrowArrayInitFromSchema \
  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromSchema)
#define ArrowArrayInitFromArrayView \
  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayInitFromArrayView)
#define ArrowArrayAllocateChildren \
  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayAllocateChildren)
#define ArrowArrayAllocateDictionary \
  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayAllocateDictionary)
#define ArrowArraySetValidityBitmap \
  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArraySetValidityBitmap)
#define ArrowArraySetBuffer NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArraySetBuffer)
#define ArrowArrayReserve NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayReserve)
#define ArrowArrayFinishBuilding \
  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayFinishBuilding)
#define ArrowArrayFinishBuildingDefault \
  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayFinishBuildingDefault)
#define ArrowArrayViewInitFromType \
  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewInitFromType)
#define ArrowArrayViewInitFromSchema \
  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewInitFromSchema)
#define ArrowArrayViewAllocateChildren \
  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewAllocateChildren)
#define ArrowArrayViewAllocateDictionary \
  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewAllocateDictionary)
#define ArrowArrayViewSetLength \
  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetLength)
#define ArrowArrayViewSetArray \
  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetArray)
#define ArrowArrayViewSetArrayMinimal \
  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetArrayMinimal)
#define ArrowArrayViewValidate \
  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewValidate)
#define ArrowArrayViewReset NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewReset)
#define ArrowBasicArrayStreamInit \
  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBasicArrayStreamInit)
#define ArrowBasicArrayStreamSetArray \
  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBasicArrayStreamSetArray)
#define ArrowBasicArrayStreamValidate \
  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBasicArrayStreamValidate)

#endif

#ifdef __cplusplus
extern "C" {
#endif

/// \defgroup nanoarrow Nanoarrow C library
///
/// Except where noted, objects are not thread-safe and clients should
/// take care to serialize accesses to methods.
///
/// Because this library is intended to be vendored, it provides full type
/// definitions and encourages clients to stack or statically allocate
/// where convenient.

/// \defgroup nanoarrow-malloc Memory management
///
/// Non-buffer members of a struct ArrowSchema and struct ArrowArray
/// must be allocated using ArrowMalloc() or ArrowRealloc() and freed
/// using ArrowFree() for schemas and arrays allocated here. Buffer members
/// are allocated using an ArrowBufferAllocator.
///
/// @{

/// \brief Allocate like malloc()
void* ArrowMalloc(int64_t size);

/// \brief Reallocate like realloc()
void* ArrowRealloc(void* ptr, int64_t size);

/// \brief Free a pointer allocated using ArrowMalloc() or ArrowRealloc().
void ArrowFree(void* ptr);

/// \brief Return the default allocator
///
/// The default allocator uses ArrowMalloc(), ArrowRealloc(), and
/// ArrowFree().
struct ArrowBufferAllocator ArrowBufferAllocatorDefault(void);

/// \brief Create a custom deallocator
///
/// Creates a buffer allocator with only a free method that can be used to
/// attach a custom deallocator to an ArrowBuffer. This may be used to
/// avoid copying an existing buffer that was not allocated using the
/// infrastructure provided here (e.g., by an R or Python object).
struct ArrowBufferAllocator ArrowBufferDeallocator(
    void (*custom_free)(struct ArrowBufferAllocator* allocator, uint8_t* ptr,
                        int64_t size),
    void* private_data);

/// @}

/// \brief Move the contents of an src ArrowSchema into dst and set src->release to NULL
/// \ingroup nanoarrow-arrow-cdata
static inline void ArrowSchemaMove(struct ArrowSchema* src, struct ArrowSchema* dst);

/// \brief Call the release callback of an ArrowSchema
/// \ingroup nanoarrow-arrow-cdata
static inline void ArrowSchemaRelease(struct ArrowSchema* schema);

/// \brief Move the contents of an src ArrowArray into dst and set src->release to NULL
/// \ingroup nanoarrow-arrow-cdata
static inline void ArrowArrayMove(struct ArrowArray* src, struct ArrowArray* dst);

/// \brief Call the release callback of an ArrowArray
/// \ingroup nanoarrow-arrow-cdata
static inline void ArrowArrayRelease(struct ArrowArray* array);

/// \brief Move the contents of an src ArrowArrayStream into dst and set src->release to
/// NULL
/// \ingroup nanoarrow-arrow-cdata
static inline void ArrowArrayStreamMove(struct ArrowArrayStream* src,
                                        struct ArrowArrayStream* dst);

/// \brief Call the get_schema callback of an ArrowArrayStream
/// \ingroup nanoarrow-arrow-cdata
///
/// Unlike the get_schema callback, this wrapper checks the return code
/// and propagates the error reported by get_last_error into error. This
/// makes it significantly less verbose to iterate over array streams
/// using NANOARROW_RETURN_NOT_OK()-style error handling.
static inline ArrowErrorCode ArrowArrayStreamGetSchema(
    struct ArrowArrayStream* array_stream, struct ArrowSchema* out,
    struct ArrowError* error);

/// \brief Call the get_next callback of an ArrowArrayStream
/// \ingroup nanoarrow-arrow-cdata
///
/// Unlike the get_next callback, this wrapper checks the return code
/// and propagates the error reported by get_last_error into error. This
/// makes it significantly less verbose to iterate over array streams
/// using NANOARROW_RETURN_NOT_OK()-style error handling.
static inline ArrowErrorCode ArrowArrayStreamGetNext(
    struct ArrowArrayStream* array_stream, struct ArrowArray* out,
    struct ArrowError* error);

/// \brief Call the get_last_error callback of an ArrowArrayStream
/// \ingroup nanoarrow-arrow-cdata
///
/// Unlike the get_last_error callback, this function never returns NULL (i.e., its
/// result is safe to use in printf-style error formatters). Null values from the
/// original callback are reported as "".
static inline const char* ArrowArrayStreamGetLastError(
    struct ArrowArrayStream* array_stream);

/// \brief Call the release callback of an ArrowArrayStream
/// \ingroup nanoarrow-arrow-cdata
static inline void ArrowArrayStreamRelease(struct ArrowArrayStream* array_stream);

/// \defgroup nanoarrow-errors Error handling
///
/// Functions generally return an errno-compatible error code; functions that
/// need to communicate more verbose error information accept a pointer
/// to an ArrowError. This can be stack or statically allocated. The
/// content of the message is undefined unless an error code has been
/// returned. If a nanoarrow function is passed a non-null ArrowError pointer, the
/// ArrowError pointed to by the argument will be populated with a
/// null-terminated error message. It is safe to pass a NULL ArrowError anywhere
/// in the nanoarrow API.
///
/// Except where documented, it is generally not safe to continue after a
/// function has returned a non-zero ArrowErrorCode. The NANOARROW_RETURN_NOT_OK and
/// NANOARROW_ASSERT_OK macros are provided to help propagate errors. C++ clients can use
/// the helpers provided in the nanoarrow.hpp header to facilitate using C++ idioms
/// for memory management and error propagation.
///
/// @{

/// \brief Set the contents of an error using printf syntax.
///
/// If error is NULL, this function does nothing and returns NANOARROW_OK.
NANOARROW_CHECK_PRINTF_ATTRIBUTE int ArrowErrorSet(struct ArrowError* error,
                                                   const char* fmt, ...);

/// @}

/// \defgroup nanoarrow-utils Utility data structures
///
/// @{

/// \brief Return a version string in the form "major.minor.patch"
const char* ArrowNanoarrowVersion(void);

/// \brief Return an integer that can be used to compare versions sequentially
int ArrowNanoarrowVersionInt(void);

/// \brief Initialize a description of buffer arrangements from a storage type
void ArrowLayoutInit(struct ArrowLayout* layout, enum ArrowType storage_type);

/// \brief Create a string view from a null-terminated string
static inline struct ArrowStringView ArrowCharView(const char* value);

/// \brief Sets the integer value of an ArrowDecimal from a string
ArrowErrorCode ArrowDecimalSetDigits(struct ArrowDecimal* decimal,
                                     struct ArrowStringView value);

/// \brief Get the integer value of an ArrowDecimal as string
ArrowErrorCode ArrowDecimalAppendDigitsToBuffer(const struct ArrowDecimal* decimal,
                                                struct ArrowBuffer* buffer);

/// @}

/// \defgroup nanoarrow-schema Creating schemas
///
/// These functions allocate, copy, and destroy ArrowSchema structures
///
/// @{

/// \brief Initialize an ArrowSchema
///
/// Initializes the fields and release callback of schema. This function
/// returns void; the caller is responsible for calling the schema->release
/// callback once the schema is no longer needed.
void ArrowSchemaInit(struct ArrowSchema* schema);

/// \brief Initialize an ArrowSchema from an ArrowType
///
/// A convenience constructor that calls ArrowSchemaInit() and
/// ArrowSchemaSetType() for the common case of constructing an
/// unparameterized type. The caller is responsible for calling the schema->release
/// callback if NANOARROW_OK is returned.
ArrowErrorCode ArrowSchemaInitFromType(struct ArrowSchema* schema, enum ArrowType type);

/// \brief Get a human-readable summary of a Schema
///
/// Writes a summary of an ArrowSchema to out (up to n - 1 characters)
/// and returns the number of characters required for the output if
/// n were sufficiently large. If recursive is non-zero, the result will
/// also include children.
int64_t ArrowSchemaToString(const struct ArrowSchema* schema, char* out, int64_t n,
                            char recursive);

/// \brief Set the format field of a schema from an ArrowType
///
/// Initializes the fields and release callback of schema. For
/// NANOARROW_TYPE_LIST, NANOARROW_TYPE_LARGE_LIST, and
/// NANOARROW_TYPE_MAP, the appropriate number of children are
/// allocated, initialized, and named; however, the caller must call
/// ArrowSchemaSetType() on the preinitialized children. Schema must have been initialized
/// using ArrowSchemaInit() or ArrowSchemaDeepCopy().
ArrowErrorCode ArrowSchemaSetType(struct ArrowSchema* schema, enum ArrowType type);

/// \brief Set the format field and initialize children of a struct schema
///
/// The specified number of children are initialized; however, the caller is responsible
/// for calling ArrowSchemaSetType() and ArrowSchemaSetName() on each child.
/// Schema must have been initialized using ArrowSchemaInit() or ArrowSchemaDeepCopy().
ArrowErrorCode ArrowSchemaSetTypeStruct(struct ArrowSchema* schema, int64_t n_children);

/// \brief Set the format field of a fixed-size schema
///
/// Returns EINVAL for fixed_size <= 0 or for type that is not
/// NANOARROW_TYPE_FIXED_SIZE_BINARY or NANOARROW_TYPE_FIXED_SIZE_LIST.
/// For NANOARROW_TYPE_FIXED_SIZE_LIST, the appropriate number of children are
/// allocated, initialized, and named; however, the caller must call
/// ArrowSchemaSetType() on the first child. Schema must have been initialized using
/// ArrowSchemaInit() or ArrowSchemaDeepCopy().
ArrowErrorCode ArrowSchemaSetTypeFixedSize(struct ArrowSchema* schema,
                                           enum ArrowType type, int32_t fixed_size);

/// \brief Set the format field of a decimal schema
///
/// Returns EINVAL for scale <= 0 or for type that is not
/// NANOARROW_TYPE_DECIMAL128 or NANOARROW_TYPE_DECIMAL256. Schema must have been
/// initialized using ArrowSchemaInit() or ArrowSchemaDeepCopy().
///
/// NOTE(review): Arrow permits zero and negative decimal scales, so the
/// condition above probably refers to decimal_precision <= 0; confirm against
/// the implementation before relying on it.
ArrowErrorCode ArrowSchemaSetTypeDecimal(struct ArrowSchema* schema, enum ArrowType type,
                                         int32_t decimal_precision,
                                         int32_t decimal_scale);

/// \brief Set the format field of a time, timestamp, or duration schema
///
/// Returns EINVAL for type that is not
/// NANOARROW_TYPE_TIME32, NANOARROW_TYPE_TIME64,
/// NANOARROW_TYPE_TIMESTAMP, or NANOARROW_TYPE_DURATION. The
/// timezone parameter must be NULL for a non-timestamp type. Schema must have been
/// initialized using ArrowSchemaInit() or ArrowSchemaDeepCopy().
ArrowErrorCode ArrowSchemaSetTypeDateTime(struct ArrowSchema* schema, enum ArrowType type,
                                          enum ArrowTimeUnit time_unit,
                                          const char* timezone);

/// \brief Set the format field of a union schema
///
/// Returns EINVAL for a type that is not NANOARROW_TYPE_DENSE_UNION
/// or NANOARROW_TYPE_SPARSE_UNION. The specified number of children are
/// allocated and initialized.
ArrowErrorCode ArrowSchemaSetTypeUnion(struct ArrowSchema* schema, enum ArrowType type,
                                       int64_t n_children);

/// \brief Make a (recursive) copy of a schema
///
/// Allocates and copies fields of schema into schema_out.
ArrowErrorCode ArrowSchemaDeepCopy(const struct ArrowSchema* schema,
                                   struct ArrowSchema* schema_out);

/// \brief Copy format into schema->format
///
/// schema must have been allocated using ArrowSchemaInitFromType() or
/// ArrowSchemaDeepCopy().
ArrowErrorCode ArrowSchemaSetFormat(struct ArrowSchema* schema, const char* format);

/// \brief Copy name into schema->name
///
/// schema must have been allocated using ArrowSchemaInitFromType() or
/// ArrowSchemaDeepCopy().
ArrowErrorCode ArrowSchemaSetName(struct ArrowSchema* schema, const char* name);

/// \brief Copy metadata into schema->metadata
///
/// schema must have been allocated using ArrowSchemaInitFromType() or
/// ArrowSchemaDeepCopy().
ArrowErrorCode ArrowSchemaSetMetadata(struct ArrowSchema* schema, const char* metadata);

/// \brief Allocate the schema->children array
///
/// Includes the memory for each child struct ArrowSchema.
/// schema must have been allocated using ArrowSchemaInitFromType() or
/// ArrowSchemaDeepCopy().
ArrowErrorCode ArrowSchemaAllocateChildren(struct ArrowSchema* schema,
                                           int64_t n_children);

/// \brief Allocate the schema->dictionary member
///
/// schema must have been allocated using ArrowSchemaInitFromType() or
/// ArrowSchemaDeepCopy().
ArrowErrorCode ArrowSchemaAllocateDictionary(struct ArrowSchema* schema);

/// @}

/// \defgroup nanoarrow-metadata Create, read, and modify schema metadata
///
/// @{

/// \brief Reader for key/value pairs in schema metadata
///
/// The ArrowMetadataReader does not own any data and is only valid
/// for the lifetime of the underlying metadata pointer.
struct ArrowMetadataReader {
  /// \brief A metadata string from a schema->metadata field.
  const char* metadata;

  /// \brief The current offset into the metadata string
  int64_t offset;

  /// \brief The number of remaining keys
  int32_t remaining_keys;
};

/// \brief Initialize an ArrowMetadataReader
ArrowErrorCode ArrowMetadataReaderInit(struct ArrowMetadataReader* reader,
                                       const char* metadata);

/// \brief Read the next key/value pair from an ArrowMetadataReader
ArrowErrorCode ArrowMetadataReaderRead(struct ArrowMetadataReader* reader,
                                       struct ArrowStringView* key_out,
                                       struct ArrowStringView* value_out);

/// \brief The number of bytes in a key/value metadata string
int64_t ArrowMetadataSizeOf(const char* metadata);

/// \brief Check for a key in schema metadata
char ArrowMetadataHasKey(const char* metadata, struct ArrowStringView key);

/// \brief Extract a value from schema metadata
///
/// If key does not exist in metadata, value_out is unmodified
ArrowErrorCode ArrowMetadataGetValue(const char* metadata, struct ArrowStringView key,
                                     struct ArrowStringView* value_out);

/// \brief Initialize a builder for schema metadata from key/value pairs
///
/// metadata can be an existing metadata string or NULL to initialize
/// an empty metadata string.
ArrowErrorCode ArrowMetadataBuilderInit(struct ArrowBuffer* buffer, const char* metadata);

/// \brief Append a key/value pair to a buffer containing serialized metadata
ArrowErrorCode ArrowMetadataBuilderAppend(struct ArrowBuffer* buffer,
                                          struct ArrowStringView key,
                                          struct ArrowStringView value);

/// \brief Set a key/value pair to a buffer containing serialized metadata
///
/// Ensures that the only entry for key in the metadata is set to value.
/// This function maintains the existing position of (the first instance of)
/// key if present in the data.
ArrowErrorCode ArrowMetadataBuilderSet(struct ArrowBuffer* buffer,
                                       struct ArrowStringView key,
                                       struct ArrowStringView value);

/// \brief Remove a key from a buffer containing serialized metadata
ArrowErrorCode ArrowMetadataBuilderRemove(struct ArrowBuffer* buffer,
                                          struct ArrowStringView key);

/// @}

/// \defgroup nanoarrow-schema-view Reading schemas
///
/// @{

/// \brief A non-owning view of a parsed ArrowSchema
///
/// Contains more readily extractable values than a raw ArrowSchema.
/// Clients can stack or statically allocate this structure but are
/// encouraged to use the provided getters to ensure forward
/// compatibility.
struct ArrowSchemaView {
  /// \brief A pointer to the schema represented by this view
  const struct ArrowSchema* schema;

  /// \brief The data type represented by the schema
  ///
  /// This value may be NANOARROW_TYPE_DICTIONARY if the schema has a
  /// non-null dictionary member; datetime types are valid values.
  /// This value will never be NANOARROW_TYPE_EXTENSION (see
  /// extension_name and/or extension_metadata to check for
  /// an extension type).
  enum ArrowType type;

  /// \brief The storage data type represented by the schema
  ///
  /// This value will never be NANOARROW_TYPE_DICTIONARY, NANOARROW_TYPE_EXTENSION
  /// or any datetime type. This value represents only the type required to
  /// interpret the buffers in the array.
  enum ArrowType storage_type;

  /// \brief The storage layout represented by the schema
  struct ArrowLayout layout;

  /// \brief The extension type name if it exists
  ///
  /// If the ARROW:extension:name key is present in schema.metadata,
  /// extension_name.data will be non-NULL.
  struct ArrowStringView extension_name;

  /// \brief The extension type metadata if it exists
  ///
  /// If the ARROW:extension:metadata key is present in schema.metadata,
  /// extension_metadata.data will be non-NULL.
  struct ArrowStringView extension_metadata;

  /// \brief Format fixed size parameter
  ///
  /// This value is set when parsing a fixed-size binary or fixed-size
  /// list schema; this value is undefined for other types. For a
  /// fixed-size binary schema this value is in bytes; for a fixed-size
  /// list schema this value refers to the number of child elements for
  /// each element of the parent.
  int32_t fixed_size;

  /// \brief Decimal bitwidth
  ///
  /// This value is set when parsing a decimal type schema;
  /// this value is undefined for other types.
  int32_t decimal_bitwidth;

  /// \brief Decimal precision
  ///
  /// This value is set when parsing a decimal type schema;
  /// this value is undefined for other types.
  int32_t decimal_precision;

  /// \brief Decimal scale
  ///
  /// This value is set when parsing a decimal type schema;
  /// this value is undefined for other types.
  int32_t decimal_scale;

  /// \brief Format time unit parameter
  ///
  /// This value is set when parsing a date/time type. The value is
  /// undefined for other types.
  enum ArrowTimeUnit time_unit;

  /// \brief Format timezone parameter
  ///
  /// This value is set when parsing a timestamp type and represents
  /// the timezone format parameter. This value points to
  /// data within the schema and is undefined for other types.
  const char* timezone;

  /// \brief Union type ids parameter
  ///
  /// This value is set when parsing a union type and represents
  /// the type ids parameter. This value points to
  /// data within the schema and is undefined for other types.
  const char* union_type_ids;
};

/// \brief Initialize an ArrowSchemaView
ArrowErrorCode ArrowSchemaViewInit(struct ArrowSchemaView* schema_view,
                                   const struct ArrowSchema* schema,
                                   struct ArrowError* error);

/// @}

/// \defgroup nanoarrow-buffer Owning, growable buffers
///
/// @{

/// \brief Initialize an ArrowBuffer
///
/// Initialize a buffer with a NULL, zero-size buffer using the default
/// buffer allocator.
static inline void ArrowBufferInit(struct ArrowBuffer* buffer);

/// \brief Set a newly-initialized buffer's allocator
///
/// Returns EINVAL if the buffer has already been allocated.
static inline ArrowErrorCode ArrowBufferSetAllocator(
    struct ArrowBuffer* buffer, struct ArrowBufferAllocator allocator);

/// \brief Reset an ArrowBuffer
///
/// Releases the buffer using the allocator's free method if
/// the buffer's data member is non-null, sets the data member
/// to NULL, and sets the buffer's size and capacity to 0.
static inline void ArrowBufferReset(struct ArrowBuffer* buffer);

/// \brief Move an ArrowBuffer
///
/// Transfers the buffer data and lifecycle management to another
/// address and resets buffer.
static inline void ArrowBufferMove(struct ArrowBuffer* src, struct ArrowBuffer* dst);

/// \brief Grow or shrink a buffer to a given capacity
///
/// When shrinking the capacity of the buffer, the buffer is only reallocated
/// if shrink_to_fit is non-zero. Calling ArrowBufferResize() does not
/// adjust the buffer's size member except to ensure that the invariant
/// capacity >= size remains true.
static inline ArrowErrorCode ArrowBufferResize(struct ArrowBuffer* buffer,
                                               int64_t new_capacity_bytes,
                                               char shrink_to_fit);

/// \brief Ensure a buffer has at least a given additional capacity
///
/// Ensures that the buffer has space to append at least
/// additional_size_bytes, overallocating when required.
static inline ArrowErrorCode ArrowBufferReserve(struct ArrowBuffer* buffer,
                                                int64_t additional_size_bytes);

/// \brief Write data to buffer and increment the buffer size
///
/// This function does not check that buffer has the required capacity
static inline void ArrowBufferAppendUnsafe(struct ArrowBuffer* buffer, const void* data,
                                           int64_t size_bytes);

/// \brief Write data to buffer and increment the buffer size
///
/// This function writes and ensures that the buffer has the required capacity,
/// possibly by reallocating the buffer. Like ArrowBufferReserve, this will
/// overallocate when reallocation is required.
static inline ArrowErrorCode ArrowBufferAppend(struct ArrowBuffer* buffer,
                                               const void* data, int64_t size_bytes);

/// \brief Write fill to buffer and increment the buffer size
///
/// This function writes the specified number of fill bytes and
/// ensures that the buffer has the required capacity.
static inline ArrowErrorCode ArrowBufferAppendFill(struct ArrowBuffer* buffer,
                                                   uint8_t value, int64_t size_bytes);

/// \brief Write an 8-bit integer to a buffer
static inline ArrowErrorCode ArrowBufferAppendInt8(struct ArrowBuffer* buffer,
                                                   int8_t value);

/// \brief Write an unsigned 8-bit integer to a buffer
static inline ArrowErrorCode ArrowBufferAppendUInt8(struct ArrowBuffer* buffer,
                                                    uint8_t value);

/// \brief Write a 16-bit integer to a buffer
static inline ArrowErrorCode ArrowBufferAppendInt16(struct ArrowBuffer* buffer,
                                                    int16_t value);

/// \brief Write an unsigned 16-bit integer to a buffer
static inline ArrowErrorCode ArrowBufferAppendUInt16(struct ArrowBuffer* buffer,
                                                     uint16_t value);

/// \brief Write a 32-bit integer to a buffer
static inline ArrowErrorCode ArrowBufferAppendInt32(struct ArrowBuffer* buffer,
                                                    int32_t value);

/// \brief Write an unsigned 32-bit integer to a buffer
static inline ArrowErrorCode ArrowBufferAppendUInt32(struct ArrowBuffer* buffer,
                                                     uint32_t value);

/// \brief Write a 64-bit integer to a buffer
static inline ArrowErrorCode ArrowBufferAppendInt64(struct ArrowBuffer* buffer,
                                                    int64_t value);

/// \brief Write an unsigned 64-bit integer to a buffer
static inline ArrowErrorCode ArrowBufferAppendUInt64(struct ArrowBuffer* buffer,
                                                     uint64_t value);

/// \brief Write a double to a buffer
static inline ArrowErrorCode ArrowBufferAppendDouble(struct ArrowBuffer* buffer,
                                                     double value);

/// \brief Write a float to a buffer
static inline ArrowErrorCode ArrowBufferAppendFloat(struct ArrowBuffer* buffer,
                                                    float value);

/// \brief Write an ArrowStringView to a buffer
static inline ArrowErrorCode ArrowBufferAppendStringView(struct ArrowBuffer* buffer,
                                                         struct ArrowStringView value);

/// \brief Write an ArrowBufferView to a buffer
static inline ArrowErrorCode ArrowBufferAppendBufferView(struct ArrowBuffer* buffer,
                                                         struct ArrowBufferView value);

/// @}

/// \defgroup nanoarrow-bitmap Bitmap utilities
///
/// @{

/// \brief Extract a boolean value from a bitmap
static inline int8_t ArrowBitGet(const uint8_t* bits, int64_t i);

/// \brief Set a boolean value to a bitmap to true
static inline void ArrowBitSet(uint8_t* bits, int64_t i);

/// \brief Set a boolean value to a bitmap to false
static inline void ArrowBitClear(uint8_t* bits, int64_t i);

/// \brief Set a boolean value to a bitmap
static inline void ArrowBitSetTo(uint8_t* bits, int64_t i, uint8_t value);

/// \brief Set a boolean value to a range in a bitmap
static inline void ArrowBitsSetTo(uint8_t* bits, int64_t start_offset, int64_t length,
                                  uint8_t bits_are_set);

/// \brief Count true values in a bitmap
static inline int64_t ArrowBitCountSet(const uint8_t* bits, int64_t i_from, int64_t i_to);

/// \brief Extract int8 boolean values from a range in a bitmap
static inline void ArrowBitsUnpackInt8(const uint8_t* bits, int64_t start_offset,
                                       int64_t length, int8_t* out);

/// \brief Extract int32 boolean values from a range in a bitmap
static inline void ArrowBitsUnpackInt32(const uint8_t* bits, int64_t start_offset,
                                        int64_t length, int32_t* out);

/// \brief Initialize an ArrowBitmap
///
/// Initialize the builder's buffer, empty its cache, and reset the size to zero
static inline void ArrowBitmapInit(struct ArrowBitmap* bitmap);

/// \brief Move an ArrowBitmap
///
/// Transfers the underlying buffer data and lifecycle management to another
/// address and resets the bitmap.
static inline void ArrowBitmapMove(struct ArrowBitmap* src, struct ArrowBitmap* dst); /// \brief Ensure a bitmap builder has at least a given additional capacity /// /// Ensures that the buffer has space to append at least /// additional_size_bits, overallocating when required. static inline ArrowErrorCode ArrowBitmapReserve(struct ArrowBitmap* bitmap, int64_t additional_size_bits); /// \brief Grow or shrink a bitmap to a given capacity /// /// When shrinking the capacity of the bitmap, the bitmap is only reallocated /// if shrink_to_fit is non-zero. Calling ArrowBitmapResize() does not /// adjust the buffer's size member except when shrinking new_capacity_bits /// to a value less than the current number of bits in the bitmap. static inline ArrowErrorCode ArrowBitmapResize(struct ArrowBitmap* bitmap, int64_t new_capacity_bits, char shrink_to_fit); /// \brief Reserve space for and append zero or more of the same boolean value to a bitmap static inline ArrowErrorCode ArrowBitmapAppend(struct ArrowBitmap* bitmap, uint8_t bits_are_set, int64_t length); /// \brief Append zero or more of the same boolean value to a bitmap static inline void ArrowBitmapAppendUnsafe(struct ArrowBitmap* bitmap, uint8_t bits_are_set, int64_t length); /// \brief Append boolean values encoded as int8_t to a bitmap /// /// The values must all be 0 or 1. static inline void ArrowBitmapAppendInt8Unsafe(struct ArrowBitmap* bitmap, const int8_t* values, int64_t n_values); /// \brief Append boolean values encoded as int32_t to a bitmap /// /// The values must all be 0 or 1. 
static inline void ArrowBitmapAppendInt32Unsafe(struct ArrowBitmap* bitmap,
                                                const int32_t* values, int64_t n_values);

/// \brief Reset a bitmap builder
///
/// Releases any memory held by buffer, empties the cache, and resets the size to zero
static inline void ArrowBitmapReset(struct ArrowBitmap* bitmap);

/// @}

/// \defgroup nanoarrow-array Creating arrays
///
/// These functions allocate, copy, and destroy ArrowArray structures.
/// Once an ArrowArray has been initialized via ArrowArrayInitFromType()
/// or ArrowArrayInitFromSchema(), the caller is responsible for releasing
/// it using the embedded release callback.
///
/// @{

/// \brief Initialize the fields of an array
///
/// Initializes the fields and release callback of array. Caller
/// is responsible for calling the array->release callback if
/// NANOARROW_OK is returned.
ArrowErrorCode ArrowArrayInitFromType(struct ArrowArray* array,
                                      enum ArrowType storage_type);

/// \brief Initialize the contents of an ArrowArray from an ArrowSchema
///
/// Caller is responsible for calling the array->release callback if
/// NANOARROW_OK is returned.
ArrowErrorCode ArrowArrayInitFromSchema(struct ArrowArray* array,
                                        const struct ArrowSchema* schema,
                                        struct ArrowError* error);

/// \brief Initialize the contents of an ArrowArray from an ArrowArrayView
///
/// Caller is responsible for calling the array->release callback if
/// NANOARROW_OK is returned.
ArrowErrorCode ArrowArrayInitFromArrayView(struct ArrowArray* array,
                                           const struct ArrowArrayView* array_view,
                                           struct ArrowError* error);

/// \brief Allocate the array->children array
///
/// Includes the memory for each child struct ArrowArray,
/// whose members are marked as released and may be subsequently initialized
/// with ArrowArrayInitFromType() or moved from an existing ArrowArray.
/// array must have been allocated using ArrowArrayInitFromType().
ArrowErrorCode ArrowArrayAllocateChildren(struct ArrowArray* array, int64_t n_children); /// \brief Allocate the array->dictionary member /// /// Includes the memory for the struct ArrowArray, whose contents /// is marked as released and may be subsequently initialized /// with ArrowArrayInitFromType() or moved from an existing ArrowArray. /// array must have been allocated using ArrowArrayInitFromType() ArrowErrorCode ArrowArrayAllocateDictionary(struct ArrowArray* array); /// \brief Set the validity bitmap of an ArrowArray /// /// array must have been allocated using ArrowArrayInitFromType() void ArrowArraySetValidityBitmap(struct ArrowArray* array, struct ArrowBitmap* bitmap); /// \brief Set a buffer of an ArrowArray /// /// array must have been allocated using ArrowArrayInitFromType() ArrowErrorCode ArrowArraySetBuffer(struct ArrowArray* array, int64_t i, struct ArrowBuffer* buffer); /// \brief Get the validity bitmap of an ArrowArray /// /// array must have been allocated using ArrowArrayInitFromType() static inline struct ArrowBitmap* ArrowArrayValidityBitmap(struct ArrowArray* array); /// \brief Get a buffer of an ArrowArray /// /// array must have been allocated using ArrowArrayInitFromType() static inline struct ArrowBuffer* ArrowArrayBuffer(struct ArrowArray* array, int64_t i); /// \brief Start element-wise appending to an ArrowArray /// /// Initializes any values needed to use ArrowArrayAppend*() functions. /// All element-wise appenders append by value and return EINVAL if the exact value /// cannot be represented by the underlying storage type. /// array must have been allocated using ArrowArrayInitFromType() static inline ArrowErrorCode ArrowArrayStartAppending(struct ArrowArray* array); /// \brief Reserve space for future appends /// /// For buffer sizes that can be calculated (i.e., not string data buffers or /// child array sizes for non-fixed-size arrays), recursively reserve space for /// additional elements. 
This is useful for reducing the number of reallocations /// that occur using the item-wise appenders. ArrowErrorCode ArrowArrayReserve(struct ArrowArray* array, int64_t additional_size_elements); /// \brief Append a null value to an array static inline ArrowErrorCode ArrowArrayAppendNull(struct ArrowArray* array, int64_t n); /// \brief Append an empty, non-null value to an array static inline ArrowErrorCode ArrowArrayAppendEmpty(struct ArrowArray* array, int64_t n); /// \brief Append a signed integer value to an array /// /// Returns NANOARROW_OK if value can be exactly represented by /// the underlying storage type or EINVAL otherwise (e.g., value /// is outside the valid array range). static inline ArrowErrorCode ArrowArrayAppendInt(struct ArrowArray* array, int64_t value); /// \brief Append an unsigned integer value to an array /// /// Returns NANOARROW_OK if value can be exactly represented by /// the underlying storage type or EINVAL otherwise (e.g., value /// is outside the valid array range). static inline ArrowErrorCode ArrowArrayAppendUInt(struct ArrowArray* array, uint64_t value); /// \brief Append a double value to an array /// /// Returns NANOARROW_OK if value can be exactly represented by /// the underlying storage type or EINVAL otherwise (e.g., value /// is outside the valid array range or there is an attempt to append /// a non-integer to an array with an integer storage type). 
static inline ArrowErrorCode ArrowArrayAppendDouble(struct ArrowArray* array,
                                                    double value);

/// \brief Append a string of bytes to an array
///
/// Returns NANOARROW_OK if value can be exactly represented by
/// the underlying storage type, EOVERFLOW if appending value would overflow
/// the offset type (e.g., if the data buffer would be larger than 2 GB for a
/// non-large string type), or EINVAL otherwise (e.g., the underlying array is not a
/// binary, string, large binary, large string, or fixed-size binary array, or value is
/// the wrong size for a fixed-size binary array).
static inline ArrowErrorCode ArrowArrayAppendBytes(struct ArrowArray* array,
                                                   struct ArrowBufferView value);

/// \brief Append a string value to an array
///
/// Returns NANOARROW_OK if value can be exactly represented by
/// the underlying storage type, EOVERFLOW if appending value would overflow
/// the offset type (e.g., if the data buffer would be larger than 2 GB for a
/// non-large string type), or EINVAL otherwise (e.g., the underlying array is not a
/// string or large string array).
static inline ArrowErrorCode ArrowArrayAppendString(struct ArrowArray* array,
                                                    struct ArrowStringView value);

/// \brief Append an Interval to an array
///
/// Returns NANOARROW_OK if value can be exactly represented by
/// the underlying storage type or EINVAL otherwise.
static inline ArrowErrorCode ArrowArrayAppendInterval(struct ArrowArray* array,
                                                      const struct ArrowInterval* value);

/// \brief Append a decimal value to an array
///
/// Returns NANOARROW_OK if array is a decimal array with the appropriate
/// bitwidth or EINVAL otherwise.
static inline ArrowErrorCode ArrowArrayAppendDecimal(struct ArrowArray* array,
                                                     const struct ArrowDecimal* value);

/// \brief Finish a nested array element
///
/// Appends a non-null element to the array based on the first child's current
/// length.
Returns NANOARROW_OK if the item was successfully added, EOVERFLOW /// if the child of a list or map array would exceed INT_MAX elements, or EINVAL /// if the underlying storage type is not a struct, list, large list, or fixed-size /// list, or if there was an attempt to add a struct or fixed-size list element where the /// length of the child array(s) did not match the expected length. static inline ArrowErrorCode ArrowArrayFinishElement(struct ArrowArray* array); /// \brief Finish a union array element /// /// Appends an element to the union type ids buffer and increments array->length. /// For sparse unions, up to one element is added to non type-id children. Returns /// EINVAL if the underlying storage type is not a union, if type_id is not valid, /// or if child sizes after appending are inconsistent. static inline ArrowErrorCode ArrowArrayFinishUnionElement(struct ArrowArray* array, int8_t type_id); /// \brief Shrink buffer capacity to the size required /// /// Also applies shrinking to any child arrays. array must have been allocated using /// ArrowArrayInitFromType static inline ArrowErrorCode ArrowArrayShrinkToFit(struct ArrowArray* array); /// \brief Finish building an ArrowArray /// /// Flushes any pointers from internal buffers that may have been reallocated /// into array->buffers and checks the actual size of the buffers /// against the expected size based on the final length. /// array must have been allocated using ArrowArrayInitFromType() ArrowErrorCode ArrowArrayFinishBuildingDefault(struct ArrowArray* array, struct ArrowError* error); /// \brief Finish building an ArrowArray with explicit validation /// /// Finish building with an explicit validation level. This could perform less validation /// (i.e. 
NANOARROW_VALIDATION_LEVEL_NONE or NANOARROW_VALIDATION_LEVEL_MINIMAL) if CPU /// buffer data access is not possible or more validation (i.e., /// NANOARROW_VALIDATION_LEVEL_FULL) if buffer content was obtained from an untrusted or /// corruptible source. ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray* array, enum ArrowValidationLevel validation_level, struct ArrowError* error); /// @} /// \defgroup nanoarrow-array-view Reading arrays /// /// These functions read and validate the contents ArrowArray structures. /// /// @{ /// \brief Initialize the contents of an ArrowArrayView void ArrowArrayViewInitFromType(struct ArrowArrayView* array_view, enum ArrowType storage_type); /// \brief Move an ArrowArrayView /// /// Transfers the ArrowArrayView data and lifecycle management to another /// address and resets the contents of src. static inline void ArrowArrayViewMove(struct ArrowArrayView* src, struct ArrowArrayView* dst); /// \brief Initialize the contents of an ArrowArrayView from an ArrowSchema ArrowErrorCode ArrowArrayViewInitFromSchema(struct ArrowArrayView* array_view, const struct ArrowSchema* schema, struct ArrowError* error); /// \brief Allocate the array_view->children array /// /// Includes the memory for each child struct ArrowArrayView ArrowErrorCode ArrowArrayViewAllocateChildren(struct ArrowArrayView* array_view, int64_t n_children); /// \brief Allocate array_view->dictionary ArrowErrorCode ArrowArrayViewAllocateDictionary(struct ArrowArrayView* array_view); /// \brief Set data-independent buffer sizes from length void ArrowArrayViewSetLength(struct ArrowArrayView* array_view, int64_t length); /// \brief Set buffer sizes and data pointers from an ArrowArray ArrowErrorCode ArrowArrayViewSetArray(struct ArrowArrayView* array_view, const struct ArrowArray* array, struct ArrowError* error); /// \brief Set buffer sizes and data pointers from an ArrowArray except for those /// that require dereferencing buffer content. 
ArrowErrorCode ArrowArrayViewSetArrayMinimal(struct ArrowArrayView* array_view,
                                             const struct ArrowArray* array,
                                             struct ArrowError* error);

/// \brief Performs checks on the content of an ArrowArrayView
///
/// If using ArrowArrayViewSetArray() to back array_view with an ArrowArray,
/// the buffer sizes and some content (first and last offset) have already
/// been validated at the "default" level. If setting the buffer pointers
/// and sizes otherwise, you may wish to perform checks at a different level. See
/// documentation for ArrowValidationLevel for the details of checks performed
/// at each level.
ArrowErrorCode ArrowArrayViewValidate(struct ArrowArrayView* array_view,
                                      enum ArrowValidationLevel validation_level,
                                      struct ArrowError* error);

/// \brief Reset the contents of an ArrowArrayView and free resources
void ArrowArrayViewReset(struct ArrowArrayView* array_view);

/// \brief Check for a null element in an ArrowArrayView
static inline int8_t ArrowArrayViewIsNull(const struct ArrowArrayView* array_view,
                                          int64_t i);

/// \brief Get the type id of a union array element
static inline int8_t ArrowArrayViewUnionTypeId(const struct ArrowArrayView* array_view,
                                               int64_t i);

/// \brief Get the child index of a union array element
static inline int8_t ArrowArrayViewUnionChildIndex(
    const struct ArrowArrayView* array_view, int64_t i);

/// \brief Get the index to use into the relevant union child array
static inline int64_t ArrowArrayViewUnionChildOffset(
    const struct ArrowArrayView* array_view, int64_t i);

/// \brief Get an element in an ArrowArrayView as an integer
///
/// This function does not check for null values, that values are actually integers, or
/// that values are within a valid range for an int64.
static inline int64_t ArrowArrayViewGetIntUnsafe(const struct ArrowArrayView* array_view, int64_t i); /// \brief Get an element in an ArrowArrayView as an unsigned integer /// /// This function does not check for null values, that values are actually integers, or /// that values are within a valid range for a uint64. static inline uint64_t ArrowArrayViewGetUIntUnsafe( const struct ArrowArrayView* array_view, int64_t i); /// \brief Get an element in an ArrowArrayView as a double /// /// This function does not check for null values, or /// that values are within a valid range for a double. static inline double ArrowArrayViewGetDoubleUnsafe( const struct ArrowArrayView* array_view, int64_t i); /// \brief Get an element in an ArrowArrayView as an ArrowStringView /// /// This function does not check for null values. static inline struct ArrowStringView ArrowArrayViewGetStringUnsafe( const struct ArrowArrayView* array_view, int64_t i); /// \brief Get an element in an ArrowArrayView as an ArrowBufferView /// /// This function does not check for null values. static inline struct ArrowBufferView ArrowArrayViewGetBytesUnsafe( const struct ArrowArrayView* array_view, int64_t i); /// \brief Get an element in an ArrowArrayView as an ArrowDecimal /// /// This function does not check for null values. The out parameter must /// be initialized with ArrowDecimalInit() with the proper parameters for this /// type before calling this for the first time. static inline void ArrowArrayViewGetDecimalUnsafe(const struct ArrowArrayView* array_view, int64_t i, struct ArrowDecimal* out); /// @} /// \defgroup nanoarrow-basic-array-stream Basic ArrowArrayStream implementation /// /// An implementation of an ArrowArrayStream based on a collection of /// zero or more previously-existing ArrowArray objects. Users should /// initialize and/or validate the contents before transferring the /// responsibility of the ArrowArrayStream elsewhere. 
///
/// @{

/// \brief Initialize an ArrowArrayStream backed by this implementation
///
/// This function moves the ownership of schema to the array_stream. If
/// this function returns NANOARROW_OK, the caller is responsible for
/// releasing the ArrowArrayStream.
ArrowErrorCode ArrowBasicArrayStreamInit(struct ArrowArrayStream* array_stream,
                                         struct ArrowSchema* schema, int64_t n_arrays);

/// \brief Set the ith ArrowArray in this ArrowArrayStream.
///
/// array_stream must have been initialized with ArrowBasicArrayStreamInit().
/// This function moves the ownership of array to the array_stream. i must
/// be greater than or equal to zero and less than the value of n_arrays passed in
/// ArrowBasicArrayStreamInit(). Callers are not required to fill all
/// n_arrays members (i.e., n_arrays is a maximum bound).
void ArrowBasicArrayStreamSetArray(struct ArrowArrayStream* array_stream, int64_t i,
                                   struct ArrowArray* array);

/// \brief Validate the contents of this ArrowArrayStream
///
/// array_stream must have been initialized with ArrowBasicArrayStreamInit().
/// This function uses ArrowArrayViewInitFromSchema() and ArrowArrayViewSetArray()
/// to validate the contents of the arrays.
ArrowErrorCode ArrowBasicArrayStreamValidate(const struct ArrowArrayStream* array_stream,
                                             struct ArrowError* error);

/// @}

// Undefine ArrowErrorCode, which may have been defined to annotate functions that return
// it to warn for an unused result.
#if defined(ArrowErrorCode)
#undef ArrowErrorCode
#endif

// Inline function definitions

#ifdef __cplusplus
}
#endif

#endif
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.
You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #ifndef NANOARROW_BUFFER_INLINE_H_INCLUDED #define NANOARROW_BUFFER_INLINE_H_INCLUDED #include #include #include #ifdef __cplusplus extern "C" { #endif static inline int64_t _ArrowGrowByFactor(int64_t current_capacity, int64_t new_capacity) { int64_t doubled_capacity = current_capacity * 2; if (doubled_capacity > new_capacity) { return doubled_capacity; } else { return new_capacity; } } static inline void ArrowBufferInit(struct ArrowBuffer* buffer) { buffer->data = NULL; buffer->size_bytes = 0; buffer->capacity_bytes = 0; buffer->allocator = ArrowBufferAllocatorDefault(); } static inline ArrowErrorCode ArrowBufferSetAllocator( struct ArrowBuffer* buffer, struct ArrowBufferAllocator allocator) { if (buffer->data == NULL) { buffer->allocator = allocator; return NANOARROW_OK; } else { return EINVAL; } } static inline void ArrowBufferReset(struct ArrowBuffer* buffer) { if (buffer->data != NULL) { buffer->allocator.free(&buffer->allocator, (uint8_t*)buffer->data, buffer->capacity_bytes); buffer->data = NULL; } buffer->capacity_bytes = 0; buffer->size_bytes = 0; } static inline void ArrowBufferMove(struct ArrowBuffer* src, struct ArrowBuffer* dst) { memcpy(dst, src, sizeof(struct ArrowBuffer)); src->data = NULL; ArrowBufferReset(src); } static inline ArrowErrorCode ArrowBufferResize(struct ArrowBuffer* buffer, int64_t new_capacity_bytes, char shrink_to_fit) { if (new_capacity_bytes < 0) { return EINVAL; } if (new_capacity_bytes > buffer->capacity_bytes || shrink_to_fit) { buffer->data = buffer->allocator.reallocate( &buffer->allocator, buffer->data, 
buffer->capacity_bytes, new_capacity_bytes); if (buffer->data == NULL && new_capacity_bytes > 0) { buffer->capacity_bytes = 0; buffer->size_bytes = 0; return ENOMEM; } buffer->capacity_bytes = new_capacity_bytes; } // Ensures that when shrinking that size <= capacity if (new_capacity_bytes < buffer->size_bytes) { buffer->size_bytes = new_capacity_bytes; } return NANOARROW_OK; } static inline ArrowErrorCode ArrowBufferReserve(struct ArrowBuffer* buffer, int64_t additional_size_bytes) { int64_t min_capacity_bytes = buffer->size_bytes + additional_size_bytes; if (min_capacity_bytes <= buffer->capacity_bytes) { return NANOARROW_OK; } return ArrowBufferResize( buffer, _ArrowGrowByFactor(buffer->capacity_bytes, min_capacity_bytes), 0); } static inline void ArrowBufferAppendUnsafe(struct ArrowBuffer* buffer, const void* data, int64_t size_bytes) { if (size_bytes > 0) { memcpy(buffer->data + buffer->size_bytes, data, size_bytes); buffer->size_bytes += size_bytes; } } static inline ArrowErrorCode ArrowBufferAppend(struct ArrowBuffer* buffer, const void* data, int64_t size_bytes) { NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(buffer, size_bytes)); ArrowBufferAppendUnsafe(buffer, data, size_bytes); return NANOARROW_OK; } static inline ArrowErrorCode ArrowBufferAppendInt8(struct ArrowBuffer* buffer, int8_t value) { return ArrowBufferAppend(buffer, &value, sizeof(int8_t)); } static inline ArrowErrorCode ArrowBufferAppendUInt8(struct ArrowBuffer* buffer, uint8_t value) { return ArrowBufferAppend(buffer, &value, sizeof(uint8_t)); } static inline ArrowErrorCode ArrowBufferAppendInt16(struct ArrowBuffer* buffer, int16_t value) { return ArrowBufferAppend(buffer, &value, sizeof(int16_t)); } static inline ArrowErrorCode ArrowBufferAppendUInt16(struct ArrowBuffer* buffer, uint16_t value) { return ArrowBufferAppend(buffer, &value, sizeof(uint16_t)); } static inline ArrowErrorCode ArrowBufferAppendInt32(struct ArrowBuffer* buffer, int32_t value) { return ArrowBufferAppend(buffer, &value, 
sizeof(int32_t)); } static inline ArrowErrorCode ArrowBufferAppendUInt32(struct ArrowBuffer* buffer, uint32_t value) { return ArrowBufferAppend(buffer, &value, sizeof(uint32_t)); } static inline ArrowErrorCode ArrowBufferAppendInt64(struct ArrowBuffer* buffer, int64_t value) { return ArrowBufferAppend(buffer, &value, sizeof(int64_t)); } static inline ArrowErrorCode ArrowBufferAppendUInt64(struct ArrowBuffer* buffer, uint64_t value) { return ArrowBufferAppend(buffer, &value, sizeof(uint64_t)); } static inline ArrowErrorCode ArrowBufferAppendDouble(struct ArrowBuffer* buffer, double value) { return ArrowBufferAppend(buffer, &value, sizeof(double)); } static inline ArrowErrorCode ArrowBufferAppendFloat(struct ArrowBuffer* buffer, float value) { return ArrowBufferAppend(buffer, &value, sizeof(float)); } static inline ArrowErrorCode ArrowBufferAppendStringView(struct ArrowBuffer* buffer, struct ArrowStringView value) { return ArrowBufferAppend(buffer, value.data, value.size_bytes); } static inline ArrowErrorCode ArrowBufferAppendBufferView(struct ArrowBuffer* buffer, struct ArrowBufferView value) { return ArrowBufferAppend(buffer, value.data.data, value.size_bytes); } static inline ArrowErrorCode ArrowBufferAppendFill(struct ArrowBuffer* buffer, uint8_t value, int64_t size_bytes) { NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(buffer, size_bytes)); memset(buffer->data + buffer->size_bytes, value, size_bytes); buffer->size_bytes += size_bytes; return NANOARROW_OK; } static const uint8_t _ArrowkBitmask[] = {1, 2, 4, 8, 16, 32, 64, 128}; static const uint8_t _ArrowkFlippedBitmask[] = {254, 253, 251, 247, 239, 223, 191, 127}; static const uint8_t _ArrowkPrecedingBitmask[] = {0, 1, 3, 7, 15, 31, 63, 127}; static const uint8_t _ArrowkTrailingBitmask[] = {255, 254, 252, 248, 240, 224, 192, 128}; static const uint8_t _ArrowkBytePopcount[] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 
5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; static inline int64_t _ArrowRoundUpToMultipleOf8(int64_t value) { return (value + 7) & ~((int64_t)7); } static inline int64_t _ArrowRoundDownToMultipleOf8(int64_t value) { return (value / 8) * 8; } static inline int64_t _ArrowBytesForBits(int64_t bits) { return (bits >> 3) + ((bits & 7) != 0); } static inline void _ArrowBitsUnpackInt8(const uint8_t word, int8_t* out) { out[0] = (word & 0x1) != 0; out[1] = (word & 0x2) != 0; out[2] = (word & 0x4) != 0; out[3] = (word & 0x8) != 0; out[4] = (word & 0x10) != 0; out[5] = (word & 0x20) != 0; out[6] = (word & 0x40) != 0; out[7] = (word & 0x80) != 0; } static inline void _ArrowBitsUnpackInt32(const uint8_t word, int32_t* out) { out[0] = (word & 0x1) != 0; out[1] = (word & 0x2) != 0; out[2] = (word & 0x4) != 0; out[3] = (word & 0x8) != 0; out[4] = (word & 0x10) != 0; out[5] = (word & 0x20) != 0; out[6] = (word & 0x40) != 0; out[7] = (word & 0x80) != 0; } static inline void _ArrowBitmapPackInt8(const int8_t* values, uint8_t* out) { *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) | ((values[3] + 0x7) & 0x8) | ((values[4] + 0xf) & 0x10) | ((values[5] + 0x1f) & 0x20) | ((values[6] + 0x3f) & 0x40) | ((values[7] + 0x7f) & 0x80)); } static inline void _ArrowBitmapPackInt32(const int32_t* values, uint8_t* out) { *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 
0x3) & 0x4) | ((values[3] + 0x7) & 0x8) | ((values[4] + 0xf) & 0x10) | ((values[5] + 0x1f) & 0x20) | ((values[6] + 0x3f) & 0x40) | ((values[7] + 0x7f) & 0x80)); } static inline int8_t ArrowBitGet(const uint8_t* bits, int64_t i) { return (bits[i >> 3] >> (i & 0x07)) & 1; } static inline void ArrowBitsUnpackInt8(const uint8_t* bits, int64_t start_offset, int64_t length, int8_t* out) { if (length == 0) { return; } const int64_t i_begin = start_offset; const int64_t i_end = start_offset + length; const int64_t i_last_valid = i_end - 1; const int64_t bytes_begin = i_begin / 8; const int64_t bytes_last_valid = i_last_valid / 8; if (bytes_begin == bytes_last_valid) { for (int i = 0; i < length; i++) { out[i] = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); } return; } // first byte for (int i = 0; i < 8 - (i_begin % 8); i++) { *out++ = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); } // middle bytes for (int64_t i = bytes_begin + 1; i < bytes_last_valid; i++) { _ArrowBitsUnpackInt8(bits[i], out); out += 8; } // last byte const int bits_remaining = (int)(i_end % 8 == 0 ? 
8 : i_end % 8); for (int i = 0; i < bits_remaining; i++) { *out++ = ArrowBitGet(&bits[bytes_last_valid], i); } } static inline void ArrowBitsUnpackInt32(const uint8_t* bits, int64_t start_offset, int64_t length, int32_t* out) { if (length == 0) { return; } const int64_t i_begin = start_offset; const int64_t i_end = start_offset + length; const int64_t i_last_valid = i_end - 1; const int64_t bytes_begin = i_begin / 8; const int64_t bytes_last_valid = i_last_valid / 8; if (bytes_begin == bytes_last_valid) { for (int i = 0; i < length; i++) { out[i] = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); } return; } // first byte for (int i = 0; i < 8 - (i_begin % 8); i++) { *out++ = ArrowBitGet(&bits[bytes_begin], i + i_begin % 8); } // middle bytes for (int64_t i = bytes_begin + 1; i < bytes_last_valid; i++) { _ArrowBitsUnpackInt32(bits[i], out); out += 8; } // last byte const int bits_remaining = (int)(i_end % 8 == 0 ? 8 : i_end % 8); for (int i = 0; i < bits_remaining; i++) { *out++ = ArrowBitGet(&bits[bytes_last_valid], i); } } static inline void ArrowBitSet(uint8_t* bits, int64_t i) { bits[i / 8] |= _ArrowkBitmask[i % 8]; } static inline void ArrowBitClear(uint8_t* bits, int64_t i) { bits[i / 8] &= _ArrowkFlippedBitmask[i % 8]; } static inline void ArrowBitSetTo(uint8_t* bits, int64_t i, uint8_t bit_is_set) { bits[i / 8] ^= ((uint8_t)(-((uint8_t)(bit_is_set != 0)) ^ bits[i / 8])) & _ArrowkBitmask[i % 8]; } static inline void ArrowBitsSetTo(uint8_t* bits, int64_t start_offset, int64_t length, uint8_t bits_are_set) { const int64_t i_begin = start_offset; const int64_t i_end = start_offset + length; const uint8_t fill_byte = (uint8_t)(-bits_are_set); const int64_t bytes_begin = i_begin / 8; const int64_t bytes_end = i_end / 8 + 1; const uint8_t first_byte_mask = _ArrowkPrecedingBitmask[i_begin % 8]; const uint8_t last_byte_mask = _ArrowkTrailingBitmask[i_end % 8]; if (bytes_end == bytes_begin + 1) { // set bits within a single byte const uint8_t only_byte_mask = i_end 
% 8 == 0 ? first_byte_mask : (uint8_t)(first_byte_mask | last_byte_mask); bits[bytes_begin] &= only_byte_mask; bits[bytes_begin] |= (uint8_t)(fill_byte & ~only_byte_mask); return; } // set/clear trailing bits of first byte bits[bytes_begin] &= first_byte_mask; bits[bytes_begin] |= (uint8_t)(fill_byte & ~first_byte_mask); if (bytes_end - bytes_begin > 2) { // set/clear whole bytes memset(bits + bytes_begin + 1, fill_byte, (size_t)(bytes_end - bytes_begin - 2)); } if (i_end % 8 == 0) { return; } // set/clear leading bits of last byte bits[bytes_end - 1] &= last_byte_mask; bits[bytes_end - 1] |= (uint8_t)(fill_byte & ~last_byte_mask); } static inline int64_t ArrowBitCountSet(const uint8_t* bits, int64_t start_offset, int64_t length) { if (length == 0) { return 0; } const int64_t i_begin = start_offset; const int64_t i_end = start_offset + length; const int64_t i_last_valid = i_end - 1; const int64_t bytes_begin = i_begin / 8; const int64_t bytes_last_valid = i_last_valid / 8; if (bytes_begin == bytes_last_valid) { // count bits within a single byte const uint8_t first_byte_mask = _ArrowkPrecedingBitmask[i_end % 8]; const uint8_t last_byte_mask = _ArrowkTrailingBitmask[i_begin % 8]; const uint8_t only_byte_mask = i_end % 8 == 0 ? last_byte_mask : (uint8_t)(first_byte_mask & last_byte_mask); const uint8_t byte_masked = bits[bytes_begin] & only_byte_mask; return _ArrowkBytePopcount[byte_masked]; } const uint8_t first_byte_mask = _ArrowkPrecedingBitmask[i_begin % 8]; const uint8_t last_byte_mask = i_end % 8 == 0 ? 
0 : _ArrowkTrailingBitmask[i_end % 8]; int64_t count = 0; // first byte count += _ArrowkBytePopcount[bits[bytes_begin] & ~first_byte_mask]; // middle bytes for (int64_t i = bytes_begin + 1; i < bytes_last_valid; i++) { count += _ArrowkBytePopcount[bits[i]]; } // last byte count += _ArrowkBytePopcount[bits[bytes_last_valid] & ~last_byte_mask]; return count; } static inline void ArrowBitmapInit(struct ArrowBitmap* bitmap) { ArrowBufferInit(&bitmap->buffer); bitmap->size_bits = 0; } static inline void ArrowBitmapMove(struct ArrowBitmap* src, struct ArrowBitmap* dst) { ArrowBufferMove(&src->buffer, &dst->buffer); dst->size_bits = src->size_bits; src->size_bits = 0; } static inline ArrowErrorCode ArrowBitmapReserve(struct ArrowBitmap* bitmap, int64_t additional_size_bits) { int64_t min_capacity_bits = bitmap->size_bits + additional_size_bits; if (min_capacity_bits <= (bitmap->buffer.capacity_bytes * 8)) { return NANOARROW_OK; } NANOARROW_RETURN_NOT_OK( ArrowBufferReserve(&bitmap->buffer, _ArrowBytesForBits(additional_size_bits))); bitmap->buffer.data[bitmap->buffer.capacity_bytes - 1] = 0; return NANOARROW_OK; } static inline ArrowErrorCode ArrowBitmapResize(struct ArrowBitmap* bitmap, int64_t new_capacity_bits, char shrink_to_fit) { if (new_capacity_bits < 0) { return EINVAL; } int64_t new_capacity_bytes = _ArrowBytesForBits(new_capacity_bits); NANOARROW_RETURN_NOT_OK( ArrowBufferResize(&bitmap->buffer, new_capacity_bytes, shrink_to_fit)); if (new_capacity_bits < bitmap->size_bits) { bitmap->size_bits = new_capacity_bits; } return NANOARROW_OK; } static inline ArrowErrorCode ArrowBitmapAppend(struct ArrowBitmap* bitmap, uint8_t bits_are_set, int64_t length) { NANOARROW_RETURN_NOT_OK(ArrowBitmapReserve(bitmap, length)); ArrowBitmapAppendUnsafe(bitmap, bits_are_set, length); return NANOARROW_OK; } static inline void ArrowBitmapAppendUnsafe(struct ArrowBitmap* bitmap, uint8_t bits_are_set, int64_t length) { ArrowBitsSetTo(bitmap->buffer.data, bitmap->size_bits, length, 
bits_are_set); bitmap->size_bits += length; bitmap->buffer.size_bytes = _ArrowBytesForBits(bitmap->size_bits); } static inline void ArrowBitmapAppendInt8Unsafe(struct ArrowBitmap* bitmap, const int8_t* values, int64_t n_values) { if (n_values == 0) { return; } const int8_t* values_cursor = values; int64_t n_remaining = n_values; int64_t out_i_cursor = bitmap->size_bits; uint8_t* out_cursor = bitmap->buffer.data + bitmap->size_bits / 8; // First byte if ((out_i_cursor % 8) != 0) { int64_t n_partial_bits = _ArrowRoundUpToMultipleOf8(out_i_cursor) - out_i_cursor; for (int i = 0; i < n_partial_bits; i++) { ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values[i]); } out_cursor++; values_cursor += n_partial_bits; n_remaining -= n_partial_bits; } // Middle bytes int64_t n_full_bytes = n_remaining / 8; for (int64_t i = 0; i < n_full_bytes; i++) { _ArrowBitmapPackInt8(values_cursor, out_cursor); values_cursor += 8; out_cursor++; } // Last byte out_i_cursor += n_full_bytes * 8; n_remaining -= n_full_bytes * 8; if (n_remaining > 0) { // Zero out the last byte *out_cursor = 0x00; for (int i = 0; i < n_remaining; i++) { ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values_cursor[i]); } out_cursor++; } bitmap->size_bits += n_values; bitmap->buffer.size_bytes = out_cursor - bitmap->buffer.data; } static inline void ArrowBitmapAppendInt32Unsafe(struct ArrowBitmap* bitmap, const int32_t* values, int64_t n_values) { if (n_values == 0) { return; } const int32_t* values_cursor = values; int64_t n_remaining = n_values; int64_t out_i_cursor = bitmap->size_bits; uint8_t* out_cursor = bitmap->buffer.data + bitmap->size_bits / 8; // First byte if ((out_i_cursor % 8) != 0) { int64_t n_partial_bits = _ArrowRoundUpToMultipleOf8(out_i_cursor) - out_i_cursor; for (int i = 0; i < n_partial_bits; i++) { ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, (uint8_t)values[i]); } out_cursor++; values_cursor += n_partial_bits; n_remaining -= n_partial_bits; } // Middle bytes int64_t 
n_full_bytes = n_remaining / 8; for (int64_t i = 0; i < n_full_bytes; i++) { _ArrowBitmapPackInt32(values_cursor, out_cursor); values_cursor += 8; out_cursor++; } // Last byte out_i_cursor += n_full_bytes * 8; n_remaining -= n_full_bytes * 8; if (n_remaining > 0) { // Zero out the last byte *out_cursor = 0x00; for (int i = 0; i < n_remaining; i++) { ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, (uint8_t)values_cursor[i]); } out_cursor++; } bitmap->size_bits += n_values; bitmap->buffer.size_bytes = out_cursor - bitmap->buffer.data; } static inline void ArrowBitmapReset(struct ArrowBitmap* bitmap) { ArrowBufferReset(&bitmap->buffer); bitmap->size_bits = 0; } #ifdef __cplusplus } #endif #endif // Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. 
#ifndef NANOARROW_ARRAY_INLINE_H_INCLUDED #define NANOARROW_ARRAY_INLINE_H_INCLUDED #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif static inline struct ArrowBitmap* ArrowArrayValidityBitmap(struct ArrowArray* array) { struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; return &private_data->bitmap; } static inline struct ArrowBuffer* ArrowArrayBuffer(struct ArrowArray* array, int64_t i) { struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; switch (i) { case 0: return &private_data->bitmap.buffer; default: return private_data->buffers + i - 1; } } // We don't currently support the case of unions where type_id != child_index; // however, these functions are used to keep track of where that assumption // is made. static inline int8_t _ArrowArrayUnionChildIndex(struct ArrowArray* array, int8_t type_id) { NANOARROW_UNUSED(array); return type_id; } static inline int8_t _ArrowArrayUnionTypeId(struct ArrowArray* array, int8_t child_index) { NANOARROW_UNUSED(array); return child_index; } static inline int32_t _ArrowParseUnionTypeIds(const char* type_ids, int8_t* out) { if (*type_ids == '\0') { return 0; } int32_t i = 0; long type_id; char* end_ptr; do { type_id = strtol(type_ids, &end_ptr, 10); if (end_ptr == type_ids || type_id < 0 || type_id > 127) { return -1; } if (out != NULL) { out[i] = (int8_t)type_id; } i++; type_ids = end_ptr; if (*type_ids == '\0') { return i; } else if (*type_ids != ',') { return -1; } else { type_ids++; } } while (1); return -1; } static inline int8_t _ArrowParsedUnionTypeIdsWillEqualChildIndices(const int8_t* type_ids, int64_t n_type_ids, int64_t n_children) { if (n_type_ids != n_children) { return 0; } for (int8_t i = 0; i < n_type_ids; i++) { if (type_ids[i] != i) { return 0; } } return 1; } static inline int8_t _ArrowUnionTypeIdsWillEqualChildIndices(const char* type_id_str, int64_t n_children) { int8_t 
type_ids[128]; int32_t n_type_ids = _ArrowParseUnionTypeIds(type_id_str, type_ids); return _ArrowParsedUnionTypeIdsWillEqualChildIndices(type_ids, n_type_ids, n_children); } static inline ArrowErrorCode ArrowArrayStartAppending(struct ArrowArray* array) { struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; switch (private_data->storage_type) { case NANOARROW_TYPE_UNINITIALIZED: return EINVAL; case NANOARROW_TYPE_SPARSE_UNION: case NANOARROW_TYPE_DENSE_UNION: // Note that this value could be -1 if the type_ids string was invalid if (private_data->union_type_id_is_child_index != 1) { return EINVAL; } else { break; } default: break; } if (private_data->storage_type == NANOARROW_TYPE_UNINITIALIZED) { return EINVAL; } // Initialize any data offset buffer with a single zero for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { if (private_data->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_DATA_OFFSET && private_data->layout.element_size_bits[i] == 64) { NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt64(ArrowArrayBuffer(array, i), 0)); } else if (private_data->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_DATA_OFFSET && private_data->layout.element_size_bits[i] == 32) { NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(ArrowArrayBuffer(array, i), 0)); } } // Start building any child arrays or dictionaries for (int64_t i = 0; i < array->n_children; i++) { NANOARROW_RETURN_NOT_OK(ArrowArrayStartAppending(array->children[i])); } if (array->dictionary != NULL) { NANOARROW_RETURN_NOT_OK(ArrowArrayStartAppending(array->dictionary)); } return NANOARROW_OK; } static inline ArrowErrorCode ArrowArrayShrinkToFit(struct ArrowArray* array) { for (int64_t i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { struct ArrowBuffer* buffer = ArrowArrayBuffer(array, i); NANOARROW_RETURN_NOT_OK(ArrowBufferResize(buffer, buffer->size_bytes, 1)); } for (int64_t i = 0; i < array->n_children; i++) { 
NANOARROW_RETURN_NOT_OK(ArrowArrayShrinkToFit(array->children[i])); } if (array->dictionary != NULL) { NANOARROW_RETURN_NOT_OK(ArrowArrayShrinkToFit(array->dictionary)); } return NANOARROW_OK; } static inline ArrowErrorCode _ArrowArrayAppendBits(struct ArrowArray* array, int64_t buffer_i, uint8_t value, int64_t n) { struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; struct ArrowBuffer* buffer = ArrowArrayBuffer(array, buffer_i); int64_t bytes_required = _ArrowRoundUpToMultipleOf8(private_data->layout.element_size_bits[buffer_i] * (array->length + 1)) / 8; if (bytes_required > buffer->size_bytes) { NANOARROW_RETURN_NOT_OK( ArrowBufferAppendFill(buffer, 0, bytes_required - buffer->size_bytes)); } ArrowBitsSetTo(buffer->data, array->length, n, value); return NANOARROW_OK; } static inline ArrowErrorCode _ArrowArrayAppendEmptyInternal(struct ArrowArray* array, int64_t n, uint8_t is_valid) { struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; if (n == 0) { return NANOARROW_OK; } // Some type-specific handling switch (private_data->storage_type) { case NANOARROW_TYPE_NA: // (An empty value for a null array *is* a null) array->null_count += n; array->length += n; return NANOARROW_OK; case NANOARROW_TYPE_DENSE_UNION: { // Add one null to the first child and append n references to that child int8_t type_id = _ArrowArrayUnionTypeId(array, 0); NANOARROW_RETURN_NOT_OK( _ArrowArrayAppendEmptyInternal(array->children[0], 1, is_valid)); NANOARROW_RETURN_NOT_OK( ArrowBufferAppendFill(ArrowArrayBuffer(array, 0), type_id, n)); for (int64_t i = 0; i < n; i++) { NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32( ArrowArrayBuffer(array, 1), (int32_t)array->children[0]->length - 1)); } // For the purposes of array->null_count, union elements are never considered "null" // even if some children contain nulls. 
array->length += n; return NANOARROW_OK; } case NANOARROW_TYPE_SPARSE_UNION: { // Add n nulls to the first child and append n references to that child int8_t type_id = _ArrowArrayUnionTypeId(array, 0); NANOARROW_RETURN_NOT_OK( _ArrowArrayAppendEmptyInternal(array->children[0], n, is_valid)); for (int64_t i = 1; i < array->n_children; i++) { NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty(array->children[i], n)); } NANOARROW_RETURN_NOT_OK( ArrowBufferAppendFill(ArrowArrayBuffer(array, 0), type_id, n)); // For the purposes of array->null_count, union elements are never considered "null" // even if some children contain nulls. array->length += n; return NANOARROW_OK; } case NANOARROW_TYPE_FIXED_SIZE_LIST: NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty( array->children[0], n * private_data->layout.child_size_elements)); break; case NANOARROW_TYPE_STRUCT: for (int64_t i = 0; i < array->n_children; i++) { NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty(array->children[i], n)); } break; default: break; } // Append n is_valid bits to the validity bitmap. If we haven't allocated a bitmap yet // and we need to append nulls, do it now. 
if (!is_valid && private_data->bitmap.buffer.data == NULL) { NANOARROW_RETURN_NOT_OK(ArrowBitmapReserve(&private_data->bitmap, array->length + n)); ArrowBitmapAppendUnsafe(&private_data->bitmap, 1, array->length); ArrowBitmapAppendUnsafe(&private_data->bitmap, is_valid, n); } else if (private_data->bitmap.buffer.data != NULL) { NANOARROW_RETURN_NOT_OK(ArrowBitmapReserve(&private_data->bitmap, n)); ArrowBitmapAppendUnsafe(&private_data->bitmap, is_valid, n); } // Add appropriate buffer fill struct ArrowBuffer* buffer; int64_t size_bytes; for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { buffer = ArrowArrayBuffer(array, i); size_bytes = private_data->layout.element_size_bits[i] / 8; switch (private_data->layout.buffer_type[i]) { case NANOARROW_BUFFER_TYPE_NONE: case NANOARROW_BUFFER_TYPE_VALIDITY: continue; case NANOARROW_BUFFER_TYPE_DATA_OFFSET: // Append the current value at the end of the offset buffer for each element NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(buffer, size_bytes * n)); for (int64_t j = 0; j < n; j++) { ArrowBufferAppendUnsafe(buffer, buffer->data + size_bytes * (array->length + j), size_bytes); } // Skip the data buffer i++; continue; case NANOARROW_BUFFER_TYPE_DATA: // Zero out the next bit of memory if (private_data->layout.element_size_bits[i] % 8 == 0) { NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFill(buffer, 0, size_bytes * n)); } else { NANOARROW_RETURN_NOT_OK(_ArrowArrayAppendBits(array, i, 0, n)); } continue; case NANOARROW_BUFFER_TYPE_TYPE_ID: case NANOARROW_BUFFER_TYPE_UNION_OFFSET: // These cases return above return EINVAL; } } array->length += n; array->null_count += n * !is_valid; return NANOARROW_OK; } static inline ArrowErrorCode ArrowArrayAppendNull(struct ArrowArray* array, int64_t n) { return _ArrowArrayAppendEmptyInternal(array, n, 0); } static inline ArrowErrorCode ArrowArrayAppendEmpty(struct ArrowArray* array, int64_t n) { return _ArrowArrayAppendEmptyInternal(array, n, 1); } static inline ArrowErrorCode 
ArrowArrayAppendInt(struct ArrowArray* array, int64_t value) { struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); switch (private_data->storage_type) { case NANOARROW_TYPE_INT64: NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(data_buffer, &value, sizeof(int64_t))); break; case NANOARROW_TYPE_INT32: _NANOARROW_CHECK_RANGE(value, INT32_MIN, INT32_MAX); NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, (int32_t)value)); break; case NANOARROW_TYPE_INT16: _NANOARROW_CHECK_RANGE(value, INT16_MIN, INT16_MAX); NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt16(data_buffer, (int16_t)value)); break; case NANOARROW_TYPE_INT8: _NANOARROW_CHECK_RANGE(value, INT8_MIN, INT8_MAX); NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt8(data_buffer, (int8_t)value)); break; case NANOARROW_TYPE_UINT64: case NANOARROW_TYPE_UINT32: case NANOARROW_TYPE_UINT16: case NANOARROW_TYPE_UINT8: _NANOARROW_CHECK_RANGE(value, 0, INT64_MAX); return ArrowArrayAppendUInt(array, value); case NANOARROW_TYPE_DOUBLE: NANOARROW_RETURN_NOT_OK(ArrowBufferAppendDouble(data_buffer, (double)value)); break; case NANOARROW_TYPE_FLOAT: NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFloat(data_buffer, (float)value)); break; case NANOARROW_TYPE_BOOL: NANOARROW_RETURN_NOT_OK(_ArrowArrayAppendBits(array, 1, value != 0, 1)); break; default: return EINVAL; } if (private_data->bitmap.buffer.data != NULL) { NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); } array->length++; return NANOARROW_OK; } static inline ArrowErrorCode ArrowArrayAppendUInt(struct ArrowArray* array, uint64_t value) { struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); switch (private_data->storage_type) { case NANOARROW_TYPE_UINT64: NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(data_buffer, &value, sizeof(uint64_t))); 
break; case NANOARROW_TYPE_UINT32: _NANOARROW_CHECK_UPPER_LIMIT(value, UINT32_MAX); NANOARROW_RETURN_NOT_OK(ArrowBufferAppendUInt32(data_buffer, (uint32_t)value)); break; case NANOARROW_TYPE_UINT16: _NANOARROW_CHECK_UPPER_LIMIT(value, UINT16_MAX); NANOARROW_RETURN_NOT_OK(ArrowBufferAppendUInt16(data_buffer, (uint16_t)value)); break; case NANOARROW_TYPE_UINT8: _NANOARROW_CHECK_UPPER_LIMIT(value, UINT8_MAX); NANOARROW_RETURN_NOT_OK(ArrowBufferAppendUInt8(data_buffer, (uint8_t)value)); break; case NANOARROW_TYPE_INT64: case NANOARROW_TYPE_INT32: case NANOARROW_TYPE_INT16: case NANOARROW_TYPE_INT8: _NANOARROW_CHECK_UPPER_LIMIT(value, INT64_MAX); return ArrowArrayAppendInt(array, value); case NANOARROW_TYPE_DOUBLE: NANOARROW_RETURN_NOT_OK(ArrowBufferAppendDouble(data_buffer, (double)value)); break; case NANOARROW_TYPE_FLOAT: NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFloat(data_buffer, (float)value)); break; case NANOARROW_TYPE_BOOL: NANOARROW_RETURN_NOT_OK(_ArrowArrayAppendBits(array, 1, value != 0, 1)); break; default: return EINVAL; } if (private_data->bitmap.buffer.data != NULL) { NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); } array->length++; return NANOARROW_OK; } static inline ArrowErrorCode ArrowArrayAppendDouble(struct ArrowArray* array, double value) { struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); switch (private_data->storage_type) { case NANOARROW_TYPE_DOUBLE: NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(data_buffer, &value, sizeof(double))); break; case NANOARROW_TYPE_FLOAT: NANOARROW_RETURN_NOT_OK(ArrowBufferAppendFloat(data_buffer, (float)value)); break; default: return EINVAL; } if (private_data->bitmap.buffer.data != NULL) { NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); } array->length++; return NANOARROW_OK; } static inline ArrowErrorCode ArrowArrayAppendBytes(struct 
ArrowArray* array, struct ArrowBufferView value) { struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; struct ArrowBuffer* offset_buffer = ArrowArrayBuffer(array, 1); struct ArrowBuffer* data_buffer = ArrowArrayBuffer( array, 1 + (private_data->storage_type != NANOARROW_TYPE_FIXED_SIZE_BINARY)); int32_t offset; int64_t large_offset; int64_t fixed_size_bytes = private_data->layout.element_size_bits[1] / 8; switch (private_data->storage_type) { case NANOARROW_TYPE_STRING: case NANOARROW_TYPE_BINARY: offset = ((int32_t*)offset_buffer->data)[array->length]; if ((((int64_t)offset) + value.size_bytes) > INT32_MAX) { return EOVERFLOW; } offset += (int32_t)value.size_bytes; NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(offset_buffer, &offset, sizeof(int32_t))); NANOARROW_RETURN_NOT_OK( ArrowBufferAppend(data_buffer, value.data.data, value.size_bytes)); break; case NANOARROW_TYPE_LARGE_STRING: case NANOARROW_TYPE_LARGE_BINARY: large_offset = ((int64_t*)offset_buffer->data)[array->length]; large_offset += value.size_bytes; NANOARROW_RETURN_NOT_OK( ArrowBufferAppend(offset_buffer, &large_offset, sizeof(int64_t))); NANOARROW_RETURN_NOT_OK( ArrowBufferAppend(data_buffer, value.data.data, value.size_bytes)); break; case NANOARROW_TYPE_FIXED_SIZE_BINARY: if (value.size_bytes != fixed_size_bytes) { return EINVAL; } NANOARROW_RETURN_NOT_OK( ArrowBufferAppend(data_buffer, value.data.data, value.size_bytes)); break; default: return EINVAL; } if (private_data->bitmap.buffer.data != NULL) { NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); } array->length++; return NANOARROW_OK; } static inline ArrowErrorCode ArrowArrayAppendString(struct ArrowArray* array, struct ArrowStringView value) { struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; struct ArrowBufferView buffer_view; buffer_view.data.data = value.data; buffer_view.size_bytes = value.size_bytes; switch 
(private_data->storage_type) { case NANOARROW_TYPE_STRING: case NANOARROW_TYPE_LARGE_STRING: case NANOARROW_TYPE_BINARY: case NANOARROW_TYPE_LARGE_BINARY: return ArrowArrayAppendBytes(array, buffer_view); default: return EINVAL; } } static inline ArrowErrorCode ArrowArrayAppendInterval(struct ArrowArray* array, const struct ArrowInterval* value) { struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); switch (private_data->storage_type) { case NANOARROW_TYPE_INTERVAL_MONTHS: { if (value->type != NANOARROW_TYPE_INTERVAL_MONTHS) { return EINVAL; } NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->months)); break; } case NANOARROW_TYPE_INTERVAL_DAY_TIME: { if (value->type != NANOARROW_TYPE_INTERVAL_DAY_TIME) { return EINVAL; } NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->days)); NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->ms)); break; } case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: { if (value->type != NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO) { return EINVAL; } NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->months)); NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(data_buffer, value->days)); NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt64(data_buffer, value->ns)); break; } default: return EINVAL; } if (private_data->bitmap.buffer.data != NULL) { NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); } array->length++; return NANOARROW_OK; } static inline ArrowErrorCode ArrowArrayAppendDecimal(struct ArrowArray* array, const struct ArrowDecimal* value) { struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); switch (private_data->storage_type) { case NANOARROW_TYPE_DECIMAL128: if (value->n_words != 2) { return EINVAL; } else { NANOARROW_RETURN_NOT_OK( 
ArrowBufferAppend(data_buffer, value->words, 2 * sizeof(uint64_t))); break; } case NANOARROW_TYPE_DECIMAL256: if (value->n_words != 4) { return EINVAL; } else { NANOARROW_RETURN_NOT_OK( ArrowBufferAppend(data_buffer, value->words, 4 * sizeof(uint64_t))); break; } default: return EINVAL; } if (private_data->bitmap.buffer.data != NULL) { NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); } array->length++; return NANOARROW_OK; } static inline ArrowErrorCode ArrowArrayFinishElement(struct ArrowArray* array) { struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; int64_t child_length; switch (private_data->storage_type) { case NANOARROW_TYPE_LIST: case NANOARROW_TYPE_MAP: child_length = array->children[0]->length; if (child_length > INT32_MAX) { return EOVERFLOW; } NANOARROW_RETURN_NOT_OK( ArrowBufferAppendInt32(ArrowArrayBuffer(array, 1), (int32_t)child_length)); break; case NANOARROW_TYPE_LARGE_LIST: child_length = array->children[0]->length; NANOARROW_RETURN_NOT_OK( ArrowBufferAppendInt64(ArrowArrayBuffer(array, 1), child_length)); break; case NANOARROW_TYPE_FIXED_SIZE_LIST: child_length = array->children[0]->length; if (child_length != ((array->length + 1) * private_data->layout.child_size_elements)) { return EINVAL; } break; case NANOARROW_TYPE_STRUCT: for (int64_t i = 0; i < array->n_children; i++) { child_length = array->children[i]->length; if (child_length != (array->length + 1)) { return EINVAL; } } break; default: return EINVAL; } if (private_data->bitmap.buffer.data != NULL) { NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1)); } array->length++; return NANOARROW_OK; } static inline ArrowErrorCode ArrowArrayFinishUnionElement(struct ArrowArray* array, int8_t type_id) { struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; int64_t child_index = _ArrowArrayUnionChildIndex(array, type_id); if (child_index < 0 || 
child_index >= array->n_children) { return EINVAL; } switch (private_data->storage_type) { case NANOARROW_TYPE_DENSE_UNION: // Append the target child length to the union offsets buffer _NANOARROW_CHECK_RANGE(array->children[child_index]->length, 0, INT32_MAX); NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32( ArrowArrayBuffer(array, 1), (int32_t)array->children[child_index]->length - 1)); break; case NANOARROW_TYPE_SPARSE_UNION: // Append one empty to any non-target column that isn't already the right length // or abort if appending a null will result in a column with invalid length for (int64_t i = 0; i < array->n_children; i++) { if (i == child_index || array->children[i]->length == (array->length + 1)) { continue; } if (array->children[i]->length != array->length) { return EINVAL; } NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty(array->children[i], 1)); } break; default: return EINVAL; } // Write to the type_ids buffer NANOARROW_RETURN_NOT_OK( ArrowBufferAppendInt8(ArrowArrayBuffer(array, 0), (int8_t)type_id)); array->length++; return NANOARROW_OK; } static inline void ArrowArrayViewMove(struct ArrowArrayView* src, struct ArrowArrayView* dst) { memcpy(dst, src, sizeof(struct ArrowArrayView)); ArrowArrayViewInitFromType(src, NANOARROW_TYPE_UNINITIALIZED); } static inline int8_t ArrowArrayViewIsNull(const struct ArrowArrayView* array_view, int64_t i) { const uint8_t* validity_buffer = array_view->buffer_views[0].data.as_uint8; i += array_view->offset; switch (array_view->storage_type) { case NANOARROW_TYPE_NA: return 0x01; case NANOARROW_TYPE_DENSE_UNION: case NANOARROW_TYPE_SPARSE_UNION: // Unions are "never null" in Arrow land return 0x00; default: return validity_buffer != NULL && !ArrowBitGet(validity_buffer, i); } } static inline int8_t ArrowArrayViewUnionTypeId(const struct ArrowArrayView* array_view, int64_t i) { switch (array_view->storage_type) { case NANOARROW_TYPE_DENSE_UNION: case NANOARROW_TYPE_SPARSE_UNION: return 
array_view->buffer_views[0].data.as_int8[i]; default: return -1; } } static inline int8_t ArrowArrayViewUnionChildIndex( const struct ArrowArrayView* array_view, int64_t i) { int8_t type_id = ArrowArrayViewUnionTypeId(array_view, i); if (array_view->union_type_id_map == NULL) { return type_id; } else { return array_view->union_type_id_map[type_id]; } } static inline int64_t ArrowArrayViewUnionChildOffset( const struct ArrowArrayView* array_view, int64_t i) { switch (array_view->storage_type) { case NANOARROW_TYPE_DENSE_UNION: return array_view->buffer_views[1].data.as_int32[i]; case NANOARROW_TYPE_SPARSE_UNION: return i; default: return -1; } } static inline int64_t ArrowArrayViewListChildOffset( const struct ArrowArrayView* array_view, int64_t i) { switch (array_view->storage_type) { case NANOARROW_TYPE_LIST: return array_view->buffer_views[1].data.as_int32[i]; case NANOARROW_TYPE_LARGE_LIST: return array_view->buffer_views[1].data.as_int64[i]; default: return -1; } } static inline int64_t ArrowArrayViewGetIntUnsafe(const struct ArrowArrayView* array_view, int64_t i) { const struct ArrowBufferView* data_view = &array_view->buffer_views[1]; i += array_view->offset; switch (array_view->storage_type) { case NANOARROW_TYPE_INT64: return data_view->data.as_int64[i]; case NANOARROW_TYPE_UINT64: return data_view->data.as_uint64[i]; case NANOARROW_TYPE_INTERVAL_MONTHS: case NANOARROW_TYPE_INT32: return data_view->data.as_int32[i]; case NANOARROW_TYPE_UINT32: return data_view->data.as_uint32[i]; case NANOARROW_TYPE_INT16: return data_view->data.as_int16[i]; case NANOARROW_TYPE_UINT16: return data_view->data.as_uint16[i]; case NANOARROW_TYPE_INT8: return data_view->data.as_int8[i]; case NANOARROW_TYPE_UINT8: return data_view->data.as_uint8[i]; case NANOARROW_TYPE_DOUBLE: return (int64_t)data_view->data.as_double[i]; case NANOARROW_TYPE_FLOAT: return (int64_t)data_view->data.as_float[i]; case NANOARROW_TYPE_BOOL: return ArrowBitGet(data_view->data.as_uint8, i); default: 
return INT64_MAX; } } static inline uint64_t ArrowArrayViewGetUIntUnsafe( const struct ArrowArrayView* array_view, int64_t i) { i += array_view->offset; const struct ArrowBufferView* data_view = &array_view->buffer_views[1]; switch (array_view->storage_type) { case NANOARROW_TYPE_INT64: return data_view->data.as_int64[i]; case NANOARROW_TYPE_UINT64: return data_view->data.as_uint64[i]; case NANOARROW_TYPE_INTERVAL_MONTHS: case NANOARROW_TYPE_INT32: return data_view->data.as_int32[i]; case NANOARROW_TYPE_UINT32: return data_view->data.as_uint32[i]; case NANOARROW_TYPE_INT16: return data_view->data.as_int16[i]; case NANOARROW_TYPE_UINT16: return data_view->data.as_uint16[i]; case NANOARROW_TYPE_INT8: return data_view->data.as_int8[i]; case NANOARROW_TYPE_UINT8: return data_view->data.as_uint8[i]; case NANOARROW_TYPE_DOUBLE: return (uint64_t)data_view->data.as_double[i]; case NANOARROW_TYPE_FLOAT: return (uint64_t)data_view->data.as_float[i]; case NANOARROW_TYPE_BOOL: return ArrowBitGet(data_view->data.as_uint8, i); default: return UINT64_MAX; } } static inline double ArrowArrayViewGetDoubleUnsafe( const struct ArrowArrayView* array_view, int64_t i) { i += array_view->offset; const struct ArrowBufferView* data_view = &array_view->buffer_views[1]; switch (array_view->storage_type) { case NANOARROW_TYPE_INT64: return (double)data_view->data.as_int64[i]; case NANOARROW_TYPE_UINT64: return (double)data_view->data.as_uint64[i]; case NANOARROW_TYPE_INT32: return data_view->data.as_int32[i]; case NANOARROW_TYPE_UINT32: return data_view->data.as_uint32[i]; case NANOARROW_TYPE_INT16: return data_view->data.as_int16[i]; case NANOARROW_TYPE_UINT16: return data_view->data.as_uint16[i]; case NANOARROW_TYPE_INT8: return data_view->data.as_int8[i]; case NANOARROW_TYPE_UINT8: return data_view->data.as_uint8[i]; case NANOARROW_TYPE_DOUBLE: return data_view->data.as_double[i]; case NANOARROW_TYPE_FLOAT: return data_view->data.as_float[i]; case NANOARROW_TYPE_BOOL: return 
ArrowBitGet(data_view->data.as_uint8, i); default: return DBL_MAX; } } static inline struct ArrowStringView ArrowArrayViewGetStringUnsafe( const struct ArrowArrayView* array_view, int64_t i) { i += array_view->offset; const struct ArrowBufferView* offsets_view = &array_view->buffer_views[1]; const char* data_view = array_view->buffer_views[2].data.as_char; struct ArrowStringView view; switch (array_view->storage_type) { case NANOARROW_TYPE_STRING: case NANOARROW_TYPE_BINARY: view.data = data_view + offsets_view->data.as_int32[i]; view.size_bytes = offsets_view->data.as_int32[i + 1] - offsets_view->data.as_int32[i]; break; case NANOARROW_TYPE_LARGE_STRING: case NANOARROW_TYPE_LARGE_BINARY: view.data = data_view + offsets_view->data.as_int64[i]; view.size_bytes = offsets_view->data.as_int64[i + 1] - offsets_view->data.as_int64[i]; break; case NANOARROW_TYPE_FIXED_SIZE_BINARY: view.size_bytes = array_view->layout.element_size_bits[1] / 8; view.data = array_view->buffer_views[1].data.as_char + (i * view.size_bytes); break; default: view.data = NULL; view.size_bytes = 0; break; } return view; } static inline struct ArrowBufferView ArrowArrayViewGetBytesUnsafe( const struct ArrowArrayView* array_view, int64_t i) { i += array_view->offset; const struct ArrowBufferView* offsets_view = &array_view->buffer_views[1]; const uint8_t* data_view = array_view->buffer_views[2].data.as_uint8; struct ArrowBufferView view; switch (array_view->storage_type) { case NANOARROW_TYPE_STRING: case NANOARROW_TYPE_BINARY: view.size_bytes = offsets_view->data.as_int32[i + 1] - offsets_view->data.as_int32[i]; view.data.as_uint8 = data_view + offsets_view->data.as_int32[i]; break; case NANOARROW_TYPE_LARGE_STRING: case NANOARROW_TYPE_LARGE_BINARY: view.size_bytes = offsets_view->data.as_int64[i + 1] - offsets_view->data.as_int64[i]; view.data.as_uint8 = data_view + offsets_view->data.as_int64[i]; break; case NANOARROW_TYPE_FIXED_SIZE_BINARY: view.size_bytes = 
array_view->layout.element_size_bits[1] / 8; view.data.as_uint8 = array_view->buffer_views[1].data.as_uint8 + (i * view.size_bytes); break; default: view.data.data = NULL; view.size_bytes = 0; break; } return view; } static inline void ArrowArrayViewGetIntervalUnsafe( const struct ArrowArrayView* array_view, int64_t i, struct ArrowInterval* out) { const uint8_t* data_view = array_view->buffer_views[1].data.as_uint8; switch (array_view->storage_type) { case NANOARROW_TYPE_INTERVAL_MONTHS: { const size_t size = sizeof(int32_t); memcpy(&out->months, data_view + i * size, sizeof(int32_t)); break; } case NANOARROW_TYPE_INTERVAL_DAY_TIME: { const size_t size = sizeof(int32_t) + sizeof(int32_t); memcpy(&out->days, data_view + i * size, sizeof(int32_t)); memcpy(&out->ms, data_view + i * size + 4, sizeof(int32_t)); break; } case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: { const size_t size = sizeof(int32_t) + sizeof(int32_t) + sizeof(int64_t); memcpy(&out->months, data_view + i * size, sizeof(int32_t)); memcpy(&out->days, data_view + i * size + 4, sizeof(int32_t)); memcpy(&out->ns, data_view + i * size + 8, sizeof(int64_t)); break; } default: break; } } static inline void ArrowArrayViewGetDecimalUnsafe(const struct ArrowArrayView* array_view, int64_t i, struct ArrowDecimal* out) { i += array_view->offset; const uint8_t* data_view = array_view->buffer_views[1].data.as_uint8; switch (array_view->storage_type) { case NANOARROW_TYPE_DECIMAL128: ArrowDecimalSetBytes(out, data_view + (i * 16)); break; case NANOARROW_TYPE_DECIMAL256: ArrowDecimalSetBytes(out, data_view + (i * 32)); break; default: memset(out->words, 0, sizeof(out->words)); break; } } #ifdef __cplusplus } #endif #endif nanoarrow/src/materialize_int64.h0000644000176200001440000001057514502402562016562 0ustar liggesusers// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. 
See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #ifndef R_MATERIALIZE_INT64_H_INCLUDED #define R_MATERIALIZE_INT64_H_INCLUDED #include #include #include "materialize_common.h" #include "nanoarrow.h" #define NA_INTEGER64 INT64_MIN static inline int nanoarrow_materialize_int64(struct ArrayViewSlice* src, struct VectorSlice* dst, struct MaterializeOptions* options) { if (src->array_view->array->dictionary != NULL) { return ENOTSUP; } int64_t* result = (int64_t*)REAL(dst->vec_sexp); int64_t n_bad_values = 0; // True for all the types supported here const uint8_t* is_valid = src->array_view->buffer_views[0].data.as_uint8; int64_t raw_src_offset = src->array_view->array->offset + src->offset; // Fill the buffer switch (src->array_view->storage_type) { case NANOARROW_TYPE_NA: for (R_xlen_t i = 0; i < dst->length; i++) { result[dst->offset + i] = NA_INTEGER64; } break; case NANOARROW_TYPE_INT64: memcpy(result + dst->offset, src->array_view->buffer_views[1].data.as_int32 + raw_src_offset, dst->length * sizeof(int64_t)); // Set any nulls to NA_INTEGER64 if (is_valid != NULL && src->array_view->array->null_count != 0) { for (R_xlen_t i = 0; i < dst->length; i++) { if (!ArrowBitGet(is_valid, raw_src_offset + i)) { result[dst->offset + i] = NA_INTEGER64; } } } break; case NANOARROW_TYPE_BOOL: case NANOARROW_TYPE_INT8: case NANOARROW_TYPE_UINT8: 
case NANOARROW_TYPE_INT16: case NANOARROW_TYPE_UINT16: case NANOARROW_TYPE_INT32: case NANOARROW_TYPE_UINT32: // No need to bounds check for these types for (R_xlen_t i = 0; i < dst->length; i++) { result[dst->offset + i] = ArrowArrayViewGetIntUnsafe(src->array_view, src->offset + i); } // Set any nulls to NA_INTEGER if (is_valid != NULL && src->array_view->array->null_count != 0) { for (R_xlen_t i = 0; i < dst->length; i++) { if (!ArrowBitGet(is_valid, raw_src_offset + i)) { result[dst->offset + i] = NA_INTEGER64; } } } break; case NANOARROW_TYPE_UINT64: case NANOARROW_TYPE_FLOAT: case NANOARROW_TYPE_DOUBLE: // Loop + bounds check. Because we don't know what memory might be // in a null slot, we have to check nulls if there are any. if (is_valid != NULL && src->array_view->array->null_count != 0) { for (R_xlen_t i = 0; i < dst->length; i++) { if (ArrowBitGet(is_valid, raw_src_offset + i)) { int64_t value = ArrowArrayViewGetIntUnsafe(src->array_view, src->offset + i); if (value > INT64_MAX || value <= NA_INTEGER64) { result[dst->offset + i] = NA_INTEGER64; n_bad_values++; } else { result[dst->offset + i] = value; } } else { result[dst->offset + i] = NA_INTEGER64; } } } else { for (R_xlen_t i = 0; i < dst->length; i++) { int64_t value = ArrowArrayViewGetIntUnsafe(src->array_view, src->offset + i); if (value > INT64_MAX || value <= NA_INTEGER64) { result[dst->offset + i] = NA_INTEGER64; n_bad_values++; } else { result[dst->offset + i] = value; } } } break; default: return EINVAL; } if (n_bad_values > 0) { warn_lossy_conversion(n_bad_values, "outside integer64 range set to NA"); } return NANOARROW_OK; } #endif nanoarrow/src/materialize_blob.h0000644000176200001440000000400714502402562016525 0ustar liggesusers// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. 
The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #ifndef R_MATERIALIZE_BLOB_H_INCLUDED #define R_MATERIALIZE_BLOB_H_INCLUDED #include #include #include "materialize_common.h" #include "nanoarrow.h" static inline int nanoarrow_materialize_blob(struct ArrayViewSlice* src, struct VectorSlice* dst, struct MaterializeOptions* options) { switch (src->array_view->storage_type) { case NANOARROW_TYPE_NA: case NANOARROW_TYPE_STRING: case NANOARROW_TYPE_LARGE_STRING: case NANOARROW_TYPE_BINARY: case NANOARROW_TYPE_LARGE_BINARY: break; default: return ENOTSUP; } if (src->array_view->storage_type == NANOARROW_TYPE_NA) { return NANOARROW_OK; } struct ArrowBufferView item; SEXP item_sexp; for (R_xlen_t i = 0; i < dst->length; i++) { if (!ArrowArrayViewIsNull(src->array_view, src->offset + i)) { item = ArrowArrayViewGetBytesUnsafe(src->array_view, src->offset + i); item_sexp = PROTECT(Rf_allocVector(RAWSXP, item.size_bytes)); memcpy(RAW(item_sexp), item.data.data, item.size_bytes); SET_VECTOR_ELT(dst->vec_sexp, dst->offset + i, item_sexp); UNPROTECT(1); } } return NANOARROW_OK; } #endif nanoarrow/src/materialize_dbl.h0000644000176200001440000000772114502402562016356 0ustar liggesusers// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. 
The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #ifndef R_MATERIALIZE_DBL_H_INCLUDED #define R_MATERIALIZE_DBL_H_INCLUDED #include #include #include "materialize_common.h" #include "nanoarrow.h" // bit64::as.integer64(2^53) #define MAX_DBL_AS_INTEGER 9007199254740992 static inline int nanoarrow_materialize_dbl(struct RConverter* converter) { if (converter->src.array_view->array->dictionary != NULL) { return ENOTSUP; } struct ArrayViewSlice* src = &converter->src; struct VectorSlice* dst = &converter->dst; double* result = REAL(dst->vec_sexp); int64_t n_bad_values = 0; // True for all the types supported here const uint8_t* is_valid = src->array_view->buffer_views[0].data.as_uint8; int64_t raw_src_offset = src->array_view->array->offset + src->offset; // Fill the buffer switch (src->array_view->storage_type) { case NANOARROW_TYPE_NA: for (R_xlen_t i = 0; i < dst->length; i++) { result[dst->offset + i] = NA_REAL; } break; case NANOARROW_TYPE_DOUBLE: memcpy(result + dst->offset, src->array_view->buffer_views[1].data.as_double + raw_src_offset, dst->length * sizeof(double)); // Set any nulls to NA_REAL if (is_valid != NULL && src->array_view->array->null_count != 0) { for (R_xlen_t i = 0; i < dst->length; i++) { if (!ArrowBitGet(is_valid, raw_src_offset + i)) { result[dst->offset + i] = NA_REAL; } } } break; case NANOARROW_TYPE_BOOL: case NANOARROW_TYPE_INT8: case NANOARROW_TYPE_UINT8: case NANOARROW_TYPE_INT16: case NANOARROW_TYPE_UINT16: case 
NANOARROW_TYPE_INT32: case NANOARROW_TYPE_UINT32: case NANOARROW_TYPE_FLOAT: // No need to bounds check these types for (R_xlen_t i = 0; i < dst->length; i++) { result[dst->offset + i] = ArrowArrayViewGetDoubleUnsafe(src->array_view, src->offset + i); } // Set any nulls to NA_REAL if (is_valid != NULL && src->array_view->array->null_count != 0) { for (R_xlen_t i = 0; i < dst->length; i++) { if (!ArrowBitGet(is_valid, raw_src_offset + i)) { result[dst->offset + i] = NA_REAL; } } } break; case NANOARROW_TYPE_INT64: case NANOARROW_TYPE_UINT64: for (R_xlen_t i = 0; i < dst->length; i++) { double value = ArrowArrayViewGetDoubleUnsafe(src->array_view, src->offset + i); if (value > MAX_DBL_AS_INTEGER || value < -MAX_DBL_AS_INTEGER) { // Content of null slot is undefined n_bad_values += is_valid == NULL || ArrowBitGet(is_valid, raw_src_offset + i); } result[dst->offset + i] = value; } // Set any nulls to NA_REAL if (is_valid != NULL && src->array_view->array->null_count != 0) { for (R_xlen_t i = 0; i < dst->length; i++) { if (!ArrowBitGet(is_valid, raw_src_offset + i)) { result[dst->offset + i] = NA_REAL; } } } break; default: return EINVAL; } if (n_bad_values > 0) { warn_lossy_conversion( n_bad_values, "may have incurred loss of precision in conversion to double()"); } return NANOARROW_OK; } #endif nanoarrow/src/init.c0000644000176200001440000002541014502402562014160 0ustar liggesusers// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. 
You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #define R_NO_REMAP #include #include #include "altrep.h" #include "util.h" /* generated by tools/make-callentries.R */ extern SEXP nanoarrow_c_make_altrep_chr(SEXP array_xptr); extern SEXP nanoarrow_c_is_altrep(SEXP x_sexp); extern SEXP nanoarrow_c_altrep_is_materialized(SEXP x_sexp); extern SEXP nanoarrow_c_altrep_force_materialize(SEXP x_sexp, SEXP recursive_sexp); extern SEXP nanoarrow_c_array_stream_get_schema(SEXP array_stream_xptr); extern SEXP nanoarrow_c_array_stream_get_next(SEXP array_stream_xptr); extern SEXP nanoarrow_c_basic_array_stream(SEXP batches_sexp, SEXP schema_xptr, SEXP validate_sexp); extern SEXP nanoarrow_c_array_list_total_length(SEXP list_of_array_xptr); extern SEXP nanoarrow_c_array_view(SEXP array_xptr, SEXP schema_xptr); extern SEXP nanoarrow_c_array_init(SEXP schema_xptr); extern SEXP nanoarrow_c_array_set_length(SEXP array_xptr, SEXP length_sexp); extern SEXP nanoarrow_c_array_set_null_count(SEXP array_xptr, SEXP null_count_sexp); extern SEXP nanoarrow_c_array_set_offset(SEXP array_xptr, SEXP offset_sexp); extern SEXP nanoarrow_c_array_set_buffers(SEXP array_xptr, SEXP buffers_sexp); extern SEXP nanoarrow_c_array_set_children(SEXP array_xptr, SEXP children_sexp); extern SEXP nanoarrow_c_array_set_dictionary(SEXP array_xptr, SEXP dictionary_xptr); extern SEXP nanoarrow_c_array_validate_after_modify(SEXP array_xptr, SEXP schema_xptr); extern SEXP nanoarrow_c_array_set_schema(SEXP array_xptr, SEXP schema_xptr, SEXP validate_sexp); extern SEXP nanoarrow_c_infer_schema_array(SEXP array_xptr); extern SEXP 
nanoarrow_c_array_proxy(SEXP array_xptr, SEXP array_view_xptr, SEXP recursive_sexp); extern SEXP nanoarrow_c_as_array_default(SEXP x_sexp, SEXP schema_sexp); extern SEXP nanoarrow_c_as_buffer_default(SEXP x_sexp); extern SEXP nanoarrow_c_buffer_append(SEXP buffer_xptr, SEXP new_buffer_xptr); extern SEXP nanoarrow_c_buffer_info(SEXP buffer_xptr); extern SEXP nanoarrow_c_buffer_head_bytes(SEXP buffer_xptr, SEXP max_bytes_sexp); extern SEXP nanoarrow_c_buffer_as_raw(SEXP buffer_xptr); extern SEXP nanoarrow_c_convert_array_stream(SEXP array_stream_xptr, SEXP ptype_sexp, SEXP size_sexp, SEXP n_sexp); extern SEXP nanoarrow_c_infer_ptype(SEXP schema_xptr); extern SEXP nanoarrow_c_convert_array(SEXP array_xptr, SEXP ptype_sexp); extern SEXP nanoarrow_c_allocate_schema(void); extern SEXP nanoarrow_c_allocate_array(void); extern SEXP nanoarrow_c_allocate_array_stream(void); extern SEXP nanoarrow_c_pointer(SEXP obj_sexp); extern SEXP nanoarrow_c_pointer_addr_dbl(SEXP ptr); extern SEXP nanoarrow_c_pointer_addr_chr(SEXP ptr); extern SEXP nanoarrow_c_pointer_addr_pretty(SEXP ptr); extern SEXP nanoarrow_c_pointer_is_valid(SEXP ptr); extern SEXP nanoarrow_c_pointer_release(SEXP ptr); extern SEXP nanoarrow_c_pointer_move(SEXP ptr_src, SEXP ptr_dst); extern SEXP nanoarrow_c_export_schema(SEXP schema_xptr, SEXP ptr_dst); extern SEXP nanoarrow_c_export_array(SEXP array_xptr, SEXP ptr_dst); extern SEXP nanoarrow_c_export_array_stream(SEXP array_stream_xptr, SEXP ptr_dst); extern SEXP nanoarrow_c_pointer_set_protected(SEXP ptr_src, SEXP protected_sexp); extern SEXP nanoarrow_c_schema_init(SEXP type_id_sexp, SEXP nullable_sexp); extern SEXP nanoarrow_c_schema_init_date_time(SEXP type_id_sexp, SEXP time_unit_sexp, SEXP timezone_sexp, SEXP nullable_sexp); extern SEXP nanoarrow_c_schema_init_decimal(SEXP type_id_sexp, SEXP precision_sexp, SEXP scale_sexp, SEXP nullable_sexp); extern SEXP nanoarrow_c_schema_init_fixed_size(SEXP type_id_sexp, SEXP fixed_size_sexp, SEXP nullable_sexp); extern 
SEXP nanoarrow_c_schema_to_list(SEXP schema_xptr); extern SEXP nanoarrow_c_schema_parse(SEXP schema_xptr); extern SEXP nanoarrow_c_schema_format(SEXP schema_xptr, SEXP recursive_sexp); extern SEXP nanoarrow_c_schema_set_format(SEXP schema_mut_xptr, SEXP format_sexp); extern SEXP nanoarrow_c_schema_set_name(SEXP schema_mut_xptr, SEXP name_sexp); extern SEXP nanoarrow_c_schema_set_metadata(SEXP schema_mut_xptr, SEXP metadata_sexp); extern SEXP nanoarrow_c_schema_set_flags(SEXP schema_mut_xptr, SEXP flags_sexp); extern SEXP nanoarrow_c_schema_set_children(SEXP schema_mut_xptr, SEXP children_sexp); extern SEXP nanoarrow_c_schema_set_dictionary(SEXP schema_mut_xptr, SEXP dictionary_xptr); extern SEXP nanoarrow_c_preserved_count(void); extern SEXP nanoarrow_c_preserved_empty(void); extern SEXP nanoarrow_c_preserve_and_release_on_other_thread(SEXP obj); extern SEXP nanoarrow_c_version(void); extern SEXP nanoarrow_c_version_runtime(void); static const R_CallMethodDef CallEntries[] = { {"nanoarrow_c_make_altrep_chr", (DL_FUNC)&nanoarrow_c_make_altrep_chr, 1}, {"nanoarrow_c_is_altrep", (DL_FUNC)&nanoarrow_c_is_altrep, 1}, {"nanoarrow_c_altrep_is_materialized", (DL_FUNC)&nanoarrow_c_altrep_is_materialized, 1}, {"nanoarrow_c_altrep_force_materialize", (DL_FUNC)&nanoarrow_c_altrep_force_materialize, 2}, {"nanoarrow_c_array_stream_get_schema", (DL_FUNC)&nanoarrow_c_array_stream_get_schema, 1}, {"nanoarrow_c_array_stream_get_next", (DL_FUNC)&nanoarrow_c_array_stream_get_next, 1}, {"nanoarrow_c_basic_array_stream", (DL_FUNC)&nanoarrow_c_basic_array_stream, 3}, {"nanoarrow_c_array_list_total_length", (DL_FUNC)&nanoarrow_c_array_list_total_length, 1}, {"nanoarrow_c_array_view", (DL_FUNC)&nanoarrow_c_array_view, 2}, {"nanoarrow_c_array_init", (DL_FUNC)&nanoarrow_c_array_init, 1}, {"nanoarrow_c_array_set_length", (DL_FUNC)&nanoarrow_c_array_set_length, 2}, {"nanoarrow_c_array_set_null_count", (DL_FUNC)&nanoarrow_c_array_set_null_count, 2}, {"nanoarrow_c_array_set_offset", 
(DL_FUNC)&nanoarrow_c_array_set_offset, 2}, {"nanoarrow_c_array_set_buffers", (DL_FUNC)&nanoarrow_c_array_set_buffers, 2}, {"nanoarrow_c_array_set_children", (DL_FUNC)&nanoarrow_c_array_set_children, 2}, {"nanoarrow_c_array_set_dictionary", (DL_FUNC)&nanoarrow_c_array_set_dictionary, 2}, {"nanoarrow_c_array_validate_after_modify", (DL_FUNC)&nanoarrow_c_array_validate_after_modify, 2}, {"nanoarrow_c_array_set_schema", (DL_FUNC)&nanoarrow_c_array_set_schema, 3}, {"nanoarrow_c_infer_schema_array", (DL_FUNC)&nanoarrow_c_infer_schema_array, 1}, {"nanoarrow_c_array_proxy", (DL_FUNC)&nanoarrow_c_array_proxy, 3}, {"nanoarrow_c_as_array_default", (DL_FUNC)&nanoarrow_c_as_array_default, 2}, {"nanoarrow_c_as_buffer_default", (DL_FUNC)&nanoarrow_c_as_buffer_default, 1}, {"nanoarrow_c_buffer_append", (DL_FUNC)&nanoarrow_c_buffer_append, 2}, {"nanoarrow_c_buffer_info", (DL_FUNC)&nanoarrow_c_buffer_info, 1}, {"nanoarrow_c_buffer_head_bytes", (DL_FUNC)&nanoarrow_c_buffer_head_bytes, 2}, {"nanoarrow_c_buffer_as_raw", (DL_FUNC)&nanoarrow_c_buffer_as_raw, 1}, {"nanoarrow_c_convert_array_stream", (DL_FUNC)&nanoarrow_c_convert_array_stream, 4}, {"nanoarrow_c_infer_ptype", (DL_FUNC)&nanoarrow_c_infer_ptype, 1}, {"nanoarrow_c_convert_array", (DL_FUNC)&nanoarrow_c_convert_array, 2}, {"nanoarrow_c_allocate_schema", (DL_FUNC)&nanoarrow_c_allocate_schema, 0}, {"nanoarrow_c_allocate_array", (DL_FUNC)&nanoarrow_c_allocate_array, 0}, {"nanoarrow_c_allocate_array_stream", (DL_FUNC)&nanoarrow_c_allocate_array_stream, 0}, {"nanoarrow_c_pointer", (DL_FUNC)&nanoarrow_c_pointer, 1}, {"nanoarrow_c_pointer_addr_dbl", (DL_FUNC)&nanoarrow_c_pointer_addr_dbl, 1}, {"nanoarrow_c_pointer_addr_chr", (DL_FUNC)&nanoarrow_c_pointer_addr_chr, 1}, {"nanoarrow_c_pointer_addr_pretty", (DL_FUNC)&nanoarrow_c_pointer_addr_pretty, 1}, {"nanoarrow_c_pointer_is_valid", (DL_FUNC)&nanoarrow_c_pointer_is_valid, 1}, {"nanoarrow_c_pointer_release", (DL_FUNC)&nanoarrow_c_pointer_release, 1}, {"nanoarrow_c_pointer_move", 
(DL_FUNC)&nanoarrow_c_pointer_move, 2}, {"nanoarrow_c_export_schema", (DL_FUNC)&nanoarrow_c_export_schema, 2}, {"nanoarrow_c_export_array", (DL_FUNC)&nanoarrow_c_export_array, 2}, {"nanoarrow_c_export_array_stream", (DL_FUNC)&nanoarrow_c_export_array_stream, 2}, {"nanoarrow_c_pointer_set_protected", (DL_FUNC)&nanoarrow_c_pointer_set_protected, 2}, {"nanoarrow_c_schema_init", (DL_FUNC)&nanoarrow_c_schema_init, 2}, {"nanoarrow_c_schema_init_date_time", (DL_FUNC)&nanoarrow_c_schema_init_date_time, 4}, {"nanoarrow_c_schema_init_decimal", (DL_FUNC)&nanoarrow_c_schema_init_decimal, 4}, {"nanoarrow_c_schema_init_fixed_size", (DL_FUNC)&nanoarrow_c_schema_init_fixed_size, 3}, {"nanoarrow_c_schema_to_list", (DL_FUNC)&nanoarrow_c_schema_to_list, 1}, {"nanoarrow_c_schema_parse", (DL_FUNC)&nanoarrow_c_schema_parse, 1}, {"nanoarrow_c_schema_format", (DL_FUNC)&nanoarrow_c_schema_format, 2}, {"nanoarrow_c_schema_set_format", (DL_FUNC)&nanoarrow_c_schema_set_format, 2}, {"nanoarrow_c_schema_set_name", (DL_FUNC)&nanoarrow_c_schema_set_name, 2}, {"nanoarrow_c_schema_set_metadata", (DL_FUNC)&nanoarrow_c_schema_set_metadata, 2}, {"nanoarrow_c_schema_set_flags", (DL_FUNC)&nanoarrow_c_schema_set_flags, 2}, {"nanoarrow_c_schema_set_children", (DL_FUNC)&nanoarrow_c_schema_set_children, 2}, {"nanoarrow_c_schema_set_dictionary", (DL_FUNC)&nanoarrow_c_schema_set_dictionary, 2}, {"nanoarrow_c_preserved_count", (DL_FUNC)&nanoarrow_c_preserved_count, 0}, {"nanoarrow_c_preserved_empty", (DL_FUNC)&nanoarrow_c_preserved_empty, 0}, {"nanoarrow_c_preserve_and_release_on_other_thread", (DL_FUNC)&nanoarrow_c_preserve_and_release_on_other_thread, 1}, {"nanoarrow_c_version", (DL_FUNC)&nanoarrow_c_version, 0}, {"nanoarrow_c_version_runtime", (DL_FUNC)&nanoarrow_c_version_runtime, 0}, {NULL, NULL, 0}}; /* end generated by tools/make-callentries.R */ void R_init_nanoarrow(DllInfo* dll) { R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); R_useDynamicSymbols(dll, FALSE); nanoarrow_init_cached_sexps(); 
nanoarrow_preserve_init(); register_nanoarrow_altrep(dll); } nanoarrow/src/convert.c0000644000176200001440000004152714547575511014722 0ustar liggesusers// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #define R_NO_REMAP #include #include #include "nanoarrow.h" #include "array.h" #include "convert.h" #include "materialize.h" #include "schema.h" static R_xlen_t nanoarrow_vec_size(SEXP vec_sexp, struct PTypeView* ptype_view) { if (ptype_view->vector_type == VECTOR_TYPE_DATA_FRAME) { return nanoarrow_data_frame_size(vec_sexp); } else { return Rf_xlength(vec_sexp); } } static void finalize_converter(SEXP converter_xptr) { struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr); if (converter != NULL) { ArrowArrayViewReset(&converter->array_view); if (converter->children != NULL) { ArrowFree(converter->children); } ArrowFree(converter); } } SEXP nanoarrow_converter_from_type(enum VectorType vector_type) { struct RConverter* converter = (struct RConverter*)ArrowMalloc(sizeof(struct RConverter)); if (converter == NULL) { Rf_error("Failed to allocate RConverter"); } // 0: ptype, 1: schema_xptr, 2: array_xptr, 3: children, 4: result SEXP converter_shelter = PROTECT(Rf_allocVector(VECSXP, 5)); SEXP converter_xptr = 
PROTECT(R_MakeExternalPtr(converter, R_NilValue, converter_shelter)); R_RegisterCFinalizer(converter_xptr, &finalize_converter); ArrowArrayViewInitFromType(&converter->array_view, NANOARROW_TYPE_UNINITIALIZED); converter->schema_view.type = NANOARROW_TYPE_UNINITIALIZED; converter->schema_view.storage_type = NANOARROW_TYPE_UNINITIALIZED; converter->src.array_view = &converter->array_view; converter->dst.vec_sexp = R_NilValue; converter->options = NULL; converter->error.message[0] = '\0'; converter->size = 0; converter->capacity = 0; converter->n_children = 0; converter->children = NULL; converter->ptype_view.vector_type = vector_type; converter->ptype_view.ptype = R_NilValue; switch (vector_type) { case VECTOR_TYPE_NULL: converter->ptype_view.sexp_type = NILSXP; break; case VECTOR_TYPE_LGL: converter->ptype_view.sexp_type = LGLSXP; break; case VECTOR_TYPE_INT: converter->ptype_view.sexp_type = INTSXP; break; case VECTOR_TYPE_DBL: converter->ptype_view.sexp_type = REALSXP; break; case VECTOR_TYPE_CHR: converter->ptype_view.sexp_type = STRSXP; break; default: UNPROTECT(2); return R_NilValue; } UNPROTECT(2); return converter_xptr; } static enum RTimeUnits time_units_from_difftime(SEXP ptype) { SEXP units_attr = Rf_getAttrib(ptype, Rf_install("units")); if (units_attr == R_NilValue || TYPEOF(units_attr) != STRSXP || Rf_length(units_attr) != 1) { Rf_error("Expected difftime 'units' attribute of type character(1)"); } const char* dst_units = Rf_translateCharUTF8(STRING_ELT(units_attr, 0)); if (strcmp(dst_units, "secs") == 0) { return R_TIME_UNIT_SECONDS; } else if (strcmp(dst_units, "mins") == 0) { return R_TIME_UNIT_MINUTES; } else if (strcmp(dst_units, "hours") == 0) { return R_TIME_UNIT_HOURS; } else if (strcmp(dst_units, "days") == 0) { return R_TIME_UNIT_DAYS; } else if (strcmp(dst_units, "weeks") == 0) { return R_TIME_UNIT_WEEKS; } else { Rf_error("Unexpected value for difftime 'units' attribute"); return R_TIME_UNIT_SECONDS; } } static void 
set_converter_data_frame(SEXP converter_xptr, struct RConverter* converter, SEXP ptype) { converter->n_children = Rf_xlength(ptype); converter->children = (struct RConverter**)ArrowMalloc(converter->n_children * sizeof(struct RConverter*)); if (converter->children == NULL) { Rf_error("Failed to allocate converter children array"); } SEXP child_converter_xptrs = PROTECT(Rf_allocVector(VECSXP, converter->n_children)); for (R_xlen_t i = 0; i < converter->n_children; i++) { SEXP child_ptype = VECTOR_ELT(ptype, i); SEXP child_converter = PROTECT(nanoarrow_converter_from_ptype(child_ptype)); converter->children[i] = (struct RConverter*)R_ExternalPtrAddr(child_converter); SET_VECTOR_ELT(child_converter_xptrs, i, child_converter); UNPROTECT(1); } SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr); SET_VECTOR_ELT(converter_shelter, 3, child_converter_xptrs); UNPROTECT(1); } static void set_converter_list_of(SEXP converter_xptr, struct RConverter* converter, SEXP ptype) { SEXP child_ptype = Rf_getAttrib(ptype, Rf_install("ptype")); if (child_ptype == R_NilValue) { Rf_error("Expected attribute 'ptype' for conversion to list_of"); } converter->children = (struct RConverter**)ArrowMalloc(1 * sizeof(struct RConverter*)); if (converter->children == NULL) { Rf_error("Failed to allocate converter children array"); } converter->n_children = 1; SEXP child_converter_xptrs = PROTECT(Rf_allocVector(VECSXP, 1)); SEXP child_converter = PROTECT(nanoarrow_converter_from_ptype(child_ptype)); converter->children[0] = (struct RConverter*)R_ExternalPtrAddr(child_converter); SET_VECTOR_ELT(child_converter_xptrs, 0, child_converter); SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr); SET_VECTOR_ELT(converter_shelter, 3, child_converter_xptrs); UNPROTECT(2); } static int set_converter_children_schema(SEXP converter_xptr, SEXP schema_xptr) { struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr); SEXP converter_shelter = 
R_ExternalPtrProtected(converter_xptr); struct ArrowSchema* schema = nanoarrow_schema_from_xptr(schema_xptr); if (schema->n_children != converter->n_children) { ArrowErrorSet(&converter->error, "Expected schema with %ld children but got schema with %ld children", (long)converter->n_children, (long)schema->n_children); return EINVAL; } SEXP child_converter_xptrs = VECTOR_ELT(converter_shelter, 3); for (R_xlen_t i = 0; i < converter->n_children; i++) { SEXP child_converter_xptr = VECTOR_ELT(child_converter_xptrs, i); SEXP child_schema_xptr = PROTECT(borrow_schema_child_xptr(schema_xptr, i)); int result = nanoarrow_converter_set_schema(child_converter_xptr, child_schema_xptr); UNPROTECT(1); if (result != NANOARROW_OK) { return result; } } return NANOARROW_OK; } static int set_converter_children_array(SEXP converter_xptr, SEXP array_xptr) { struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr); SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr); struct ArrowArray* array = nanoarrow_array_from_xptr(array_xptr); if (array->n_children != converter->n_children) { ArrowErrorSet(&converter->error, "Expected array with %ld children but got array with %ld children", (long)converter->n_children, (long)array->n_children); return EINVAL; } SEXP child_converter_xptrs = VECTOR_ELT(converter_shelter, 3); for (R_xlen_t i = 0; i < converter->n_children; i++) { SEXP child_converter_xptr = VECTOR_ELT(child_converter_xptrs, i); SEXP child_array_xptr = PROTECT(borrow_array_child_xptr(array_xptr, i)); int result = nanoarrow_converter_set_array(child_converter_xptr, child_array_xptr); UNPROTECT(1); if (result != NANOARROW_OK) { return result; } } return NANOARROW_OK; } SEXP nanoarrow_converter_from_ptype(SEXP ptype) { SEXP converter_xptr = PROTECT(nanoarrow_converter_from_type(VECTOR_TYPE_NULL)); SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr); struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr); if 
(Rf_isObject(ptype)) { if (nanoarrow_ptype_is_data_frame(ptype)) { converter->ptype_view.vector_type = VECTOR_TYPE_DATA_FRAME; set_converter_data_frame(converter_xptr, converter, ptype); } else if (Rf_inherits(ptype, "blob")) { converter->ptype_view.vector_type = VECTOR_TYPE_BLOB; } else if (Rf_inherits(ptype, "vctrs_list_of")) { converter->ptype_view.vector_type = VECTOR_TYPE_LIST_OF; set_converter_list_of(converter_xptr, converter, ptype); } else if (Rf_inherits(ptype, "vctrs_unspecified")) { converter->ptype_view.vector_type = VECTOR_TYPE_UNSPECIFIED; } else if (Rf_inherits(ptype, "Date")) { converter->ptype_view.vector_type = VECTOR_TYPE_DATE; converter->ptype_view.r_time_units = R_TIME_UNIT_DAYS; } else if (Rf_inherits(ptype, "POSIXct")) { converter->ptype_view.vector_type = VECTOR_TYPE_POSIXCT; converter->ptype_view.r_time_units = R_TIME_UNIT_SECONDS; } else if (Rf_inherits(ptype, "difftime")) { converter->ptype_view.vector_type = VECTOR_TYPE_DIFFTIME; converter->ptype_view.r_time_units = time_units_from_difftime(ptype); } else if (Rf_inherits(ptype, "integer64")) { converter->ptype_view.vector_type = VECTOR_TYPE_INTEGER64; } else { converter->ptype_view.vector_type = VECTOR_TYPE_OTHER; } } else { switch (TYPEOF(ptype)) { case LGLSXP: converter->ptype_view.vector_type = VECTOR_TYPE_LGL; break; case INTSXP: converter->ptype_view.vector_type = VECTOR_TYPE_INT; break; case REALSXP: converter->ptype_view.vector_type = VECTOR_TYPE_DBL; break; case STRSXP: converter->ptype_view.vector_type = VECTOR_TYPE_CHR; break; default: converter->ptype_view.vector_type = VECTOR_TYPE_OTHER; break; } } converter->ptype_view.ptype = ptype; converter->ptype_view.sexp_type = TYPEOF(ptype); SET_VECTOR_ELT(converter_shelter, 0, ptype); UNPROTECT(1); return converter_xptr; } int nanoarrow_converter_set_schema(SEXP converter_xptr, SEXP schema_xptr) { struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr); SEXP converter_shelter = 
R_ExternalPtrProtected(converter_xptr); struct ArrowSchema* schema = nanoarrow_schema_from_xptr(schema_xptr); NANOARROW_RETURN_NOT_OK( ArrowSchemaViewInit(&converter->schema_view, schema, &converter->error)); // TODO: Currently we error at the materialize stage if a conversion is not possible; // however, at this stage we have all the information we need to calculate that. SET_VECTOR_ELT(converter_shelter, 1, schema_xptr); ArrowArrayViewReset(&converter->array_view); SET_VECTOR_ELT(converter_shelter, 2, R_NilValue); NANOARROW_RETURN_NOT_OK( ArrowArrayViewInitFromSchema(&converter->array_view, schema, &converter->error)); if (converter->ptype_view.vector_type == VECTOR_TYPE_LIST_OF || converter->ptype_view.vector_type == VECTOR_TYPE_DATA_FRAME) { set_converter_children_schema(converter_xptr, schema_xptr); } return NANOARROW_OK; } int nanoarrow_converter_set_array(SEXP converter_xptr, SEXP array_xptr) { struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr); SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr); struct ArrowArray* array = nanoarrow_array_from_xptr(array_xptr); NANOARROW_RETURN_NOT_OK( ArrowArrayViewSetArray(&converter->array_view, array, &converter->error)); SET_VECTOR_ELT(converter_shelter, 2, array_xptr); converter->src.offset = 0; converter->src.length = 0; if (converter->ptype_view.vector_type == VECTOR_TYPE_LIST_OF || converter->ptype_view.vector_type == VECTOR_TYPE_DATA_FRAME) { set_converter_children_array(converter_xptr, array_xptr); } return NANOARROW_OK; } void sync_after_converter_reallocate(SEXP converter_xptr, struct RConverter* converter, SEXP result_sexp, R_xlen_t capacity) { SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr); SET_VECTOR_ELT(converter_shelter, 4, result_sexp); converter->dst.vec_sexp = result_sexp; converter->dst.offset = 0; converter->dst.length = 0; converter->size = 0; converter->capacity = capacity; if (converter->ptype_view.vector_type == VECTOR_TYPE_DATA_FRAME) 
{ SEXP child_converters = VECTOR_ELT(converter_shelter, 3); for (R_xlen_t i = 0; i < converter->n_children; i++) { sync_after_converter_reallocate(VECTOR_ELT(child_converters, i), converter->children[i], VECTOR_ELT(result_sexp, i), capacity); } } } int nanoarrow_converter_reserve(SEXP converter_xptr, R_xlen_t additional_size) { struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr); SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr); SEXP current_result = VECTOR_ELT(converter_shelter, 4); if (current_result != R_NilValue) { ArrowErrorSet(&converter->error, "Reallocation in converter is not implemented"); return ENOTSUP; } SEXP result_sexp; if (converter->ptype_view.ptype != R_NilValue) { result_sexp = PROTECT( nanoarrow_materialize_realloc(converter->ptype_view.ptype, additional_size)); } else { result_sexp = PROTECT(nanoarrow_alloc_type(converter->ptype_view.vector_type, additional_size)); } sync_after_converter_reallocate(converter_xptr, converter, result_sexp, additional_size); UNPROTECT(1); return NANOARROW_OK; } R_xlen_t nanoarrow_converter_materialize_n(SEXP converter_xptr, R_xlen_t n) { struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr); if ((converter->dst.offset + n) > converter->capacity) { n = converter->capacity - converter->dst.offset; } if ((converter->src.offset + n) > converter->array_view.array->length) { n = converter->array_view.array->length - converter->src.offset; } if (n == 0) { return 0; } converter->src.length = converter->dst.length = n; int result = nanoarrow_materialize(converter, converter_xptr); if (result != NANOARROW_OK) { ArrowErrorSet(&converter->error, "Error in nanoarrow_materialize()"); return 0; } converter->src.offset += n; converter->dst.offset += n; converter->size += n; return n; } int nanoarrow_converter_materialize_all(SEXP converter_xptr) { struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr); R_xlen_t remaining = 
converter->array_view.array->length; NANOARROW_RETURN_NOT_OK(nanoarrow_converter_reserve(converter_xptr, remaining)); if (nanoarrow_converter_materialize_n(converter_xptr, remaining) != remaining) { return ERANGE; } else { return NANOARROW_OK; } } int nanoarrow_converter_finalize(SEXP converter_xptr) { struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr); SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr); SEXP current_result = VECTOR_ELT(converter_shelter, 4); // Materialize never called (e.g., empty stream) if (current_result == R_NilValue) { NANOARROW_RETURN_NOT_OK(nanoarrow_converter_reserve(converter_xptr, 0)); current_result = VECTOR_ELT(converter_shelter, 4); } // Check result size. A future implementation could also shrink the length // or reallocate a shorter vector. R_xlen_t current_result_size = nanoarrow_vec_size(current_result, &converter->ptype_view); if (current_result_size != converter->size) { ArrowErrorSet(&converter->error, "Expected result of size %ld but got result of size %ld", (long)current_result_size, (long)converter->size); return ENOTSUP; } return NANOARROW_OK; } SEXP nanoarrow_converter_release_result(SEXP converter_xptr) { struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr); SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr); // PROTECT()ing here because we are about to release the object from the // shelter of the converter and return it SEXP result = PROTECT(VECTOR_ELT(converter_shelter, 4)); SET_VECTOR_ELT(converter_shelter, 4, R_NilValue); converter->dst.vec_sexp = R_NilValue; converter->dst.offset = 0; converter->dst.length = 0; converter->size = 0; converter->capacity = 0; UNPROTECT(1); return result; } void nanoarrow_converter_stop(SEXP converter_xptr) { struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr); Rf_error("%s", ArrowErrorMessage(&converter->error)); } 
nanoarrow/src/materialize_posixct.h0000644000176200001440000000462414355121773017315 0ustar liggesusers// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #ifndef R_MATERIALIZE_POSIXCT_H_INCLUDED #define R_MATERIALIZE_POSIXCT_H_INCLUDED #include #include #include "materialize_common.h" #include "materialize_dbl.h" #include "nanoarrow.h" static inline int nanoarrow_materialize_posixct(struct RConverter* converter) { if (converter->ptype_view.sexp_type == REALSXP) { enum ArrowTimeUnit time_unit; switch (converter->schema_view.type) { case NANOARROW_TYPE_NA: time_unit = NANOARROW_TIME_UNIT_SECOND; NANOARROW_RETURN_NOT_OK(nanoarrow_materialize_dbl(converter)); break; case NANOARROW_TYPE_DATE64: time_unit = NANOARROW_TIME_UNIT_MILLI; NANOARROW_RETURN_NOT_OK(nanoarrow_materialize_dbl(converter)); break; case NANOARROW_TYPE_TIMESTAMP: time_unit = converter->schema_view.time_unit; NANOARROW_RETURN_NOT_OK(nanoarrow_materialize_dbl(converter)); break; default: return EINVAL; } double scale; switch (time_unit) { case NANOARROW_TIME_UNIT_SECOND: scale = 1; break; case NANOARROW_TIME_UNIT_MILLI: scale = 1e-3; break; case NANOARROW_TIME_UNIT_MICRO: scale = 1e-6; break; case NANOARROW_TIME_UNIT_NANO: scale = 1e-9; break; default: return EINVAL; } if 
(scale != 1) { double* result = REAL(converter->dst.vec_sexp); for (int64_t i = 0; i < converter->dst.length; i++) { result[converter->dst.offset + i] = result[converter->dst.offset + i] * scale; } } return NANOARROW_OK; } return EINVAL; } #endif nanoarrow/src/infer_ptype.c0000644000176200001440000001202714547575511015557 0ustar liggesusers// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #define R_NO_REMAP #include #include #include "nanoarrow.h" #include "altrep.h" #include "array.h" #include "array_view.h" #include "materialize.h" #include "schema.h" #include "util.h" // These conversions are the default R-native type guesses for // an array that don't require extra information from the ptype (e.g., // factor with levels). Some of these guesses may result in a conversion // that later warns for out-of-range values (e.g., int64 to double()); // however, a user can use the convert_array(x, ptype = something_safer()) // when this occurs. 
enum VectorType nanoarrow_infer_vector_type(enum ArrowType type) { switch (type) { case NANOARROW_TYPE_BOOL: return VECTOR_TYPE_LGL; case NANOARROW_TYPE_INT8: case NANOARROW_TYPE_UINT8: case NANOARROW_TYPE_INT16: case NANOARROW_TYPE_UINT16: case NANOARROW_TYPE_INT32: return VECTOR_TYPE_INT; case NANOARROW_TYPE_UINT32: case NANOARROW_TYPE_INT64: case NANOARROW_TYPE_UINT64: case NANOARROW_TYPE_FLOAT: case NANOARROW_TYPE_DOUBLE: case NANOARROW_TYPE_DECIMAL128: return VECTOR_TYPE_DBL; case NANOARROW_TYPE_STRING: case NANOARROW_TYPE_LARGE_STRING: return VECTOR_TYPE_CHR; case NANOARROW_TYPE_DENSE_UNION: case NANOARROW_TYPE_SPARSE_UNION: case NANOARROW_TYPE_STRUCT: return VECTOR_TYPE_DATA_FRAME; default: return VECTOR_TYPE_OTHER; } } // The same as the above, but from a nanoarrow_schema() enum VectorType nanoarrow_infer_vector_type_schema(SEXP schema_xptr) { struct ArrowSchema* schema = nanoarrow_schema_from_xptr(schema_xptr); struct ArrowSchemaView schema_view; struct ArrowError error; if (ArrowSchemaViewInit(&schema_view, schema, &error) != NANOARROW_OK) { Rf_error("nanoarrow_infer_vector_type_schema(): %s", ArrowErrorMessage(&error)); } if (schema_view.extension_name.size_bytes > 0) { return VECTOR_TYPE_OTHER; } else { return nanoarrow_infer_vector_type(schema_view.type); } } // The same as the above, but from a nanoarrow_array() enum VectorType nanoarrow_infer_vector_type_array(SEXP array_xptr) { return nanoarrow_infer_vector_type_schema(array_xptr_get_schema(array_xptr)); } // Call nanoarrow::infer_ptype_other(), which handles less common types that // are easier to compute in R or gives an informative error if this is // not possible. 
static SEXP call_infer_ptype_other(SEXP schema_xptr) { SEXP fun = PROTECT(Rf_install("infer_ptype_other")); SEXP call = PROTECT(Rf_lang2(fun, schema_xptr)); SEXP result = PROTECT(Rf_eval(call, nanoarrow_ns_pkg)); UNPROTECT(3); return result; } SEXP nanoarrow_c_infer_ptype(SEXP schema_xptr); static SEXP infer_ptype_data_frame(SEXP schema_xptr) { struct ArrowSchema* schema = nanoarrow_schema_from_xptr(schema_xptr); SEXP result = PROTECT(Rf_allocVector(VECSXP, schema->n_children)); SEXP result_names = PROTECT(Rf_allocVector(STRSXP, schema->n_children)); for (R_xlen_t i = 0; i < schema->n_children; i++) { SEXP child_xptr = PROTECT(borrow_schema_child_xptr(schema_xptr, i)); SET_VECTOR_ELT(result, i, nanoarrow_c_infer_ptype(child_xptr)); UNPROTECT(1); struct ArrowSchema* child = schema->children[i]; if (child->name != NULL) { SET_STRING_ELT(result_names, i, Rf_mkCharCE(child->name, CE_UTF8)); } else { SET_STRING_ELT(result_names, i, Rf_mkChar("")); } } Rf_setAttrib(result, R_ClassSymbol, nanoarrow_cls_data_frame); Rf_setAttrib(result, R_NamesSymbol, result_names); SEXP rownames = PROTECT(Rf_allocVector(INTSXP, 2)); INTEGER(rownames)[0] = NA_INTEGER; INTEGER(rownames)[1] = 0; Rf_setAttrib(result, R_RowNamesSymbol, rownames); UNPROTECT(3); return result; } SEXP nanoarrow_c_infer_ptype(SEXP schema_xptr) { enum VectorType vector_type = nanoarrow_infer_vector_type_schema(schema_xptr); SEXP ptype = R_NilValue; switch (vector_type) { case VECTOR_TYPE_LGL: case VECTOR_TYPE_INT: case VECTOR_TYPE_DBL: case VECTOR_TYPE_CHR: ptype = PROTECT(nanoarrow_alloc_type(vector_type, 0)); break; case VECTOR_TYPE_DATA_FRAME: ptype = PROTECT(infer_ptype_data_frame(schema_xptr)); break; default: ptype = PROTECT(call_infer_ptype_other(schema_xptr)); break; } UNPROTECT(1); return ptype; } nanoarrow/src/materialize_date.h0000644000176200001440000000251414502402562016525 0ustar liggesusers // Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. 
See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #ifndef R_MATERIALIZE_DATE_H_INCLUDED #define R_MATERIALIZE_DATE_H_INCLUDED #include #include #include "materialize_common.h" #include "materialize_dbl.h" #include "nanoarrow.h" static int nanoarrow_materialize_date(struct RConverter* converter) { if (converter->ptype_view.sexp_type == REALSXP) { switch (converter->schema_view.type) { case NANOARROW_TYPE_NA: case NANOARROW_TYPE_DATE32: return nanoarrow_materialize_dbl(converter); default: break; } } return ENOTSUP; } #endif nanoarrow/src/nanoarrow.c0000644000176200001440000034002214556775567015255 0ustar liggesusers// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. 
See the License for the // specific language governing permissions and limitations // under the License. #include #include #include #include #include #include #include "nanoarrow.h" const char* ArrowNanoarrowVersion(void) { return NANOARROW_VERSION; } int ArrowNanoarrowVersionInt(void) { return NANOARROW_VERSION_INT; } ArrowErrorCode ArrowErrorSet(struct ArrowError* error, const char* fmt, ...) { if (error == NULL) { return NANOARROW_OK; } memset(error->message, 0, sizeof(error->message)); va_list args; va_start(args, fmt); int chars_needed = vsnprintf(error->message, sizeof(error->message), fmt, args); va_end(args); if (chars_needed < 0) { return EINVAL; } else if (((size_t)chars_needed) >= sizeof(error->message)) { return ERANGE; } else { return NANOARROW_OK; } } void ArrowLayoutInit(struct ArrowLayout* layout, enum ArrowType storage_type) { layout->buffer_type[0] = NANOARROW_BUFFER_TYPE_VALIDITY; layout->buffer_data_type[0] = NANOARROW_TYPE_BOOL; layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA; layout->buffer_data_type[1] = storage_type; layout->buffer_type[2] = NANOARROW_BUFFER_TYPE_NONE; layout->buffer_data_type[2] = NANOARROW_TYPE_UNINITIALIZED; layout->element_size_bits[0] = 1; layout->element_size_bits[1] = 0; layout->element_size_bits[2] = 0; layout->child_size_elements = 0; switch (storage_type) { case NANOARROW_TYPE_UNINITIALIZED: case NANOARROW_TYPE_NA: layout->buffer_type[0] = NANOARROW_BUFFER_TYPE_NONE; layout->buffer_data_type[0] = NANOARROW_TYPE_UNINITIALIZED; layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_NONE; layout->buffer_data_type[1] = NANOARROW_TYPE_UNINITIALIZED; layout->element_size_bits[0] = 0; break; case NANOARROW_TYPE_LIST: case NANOARROW_TYPE_MAP: layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; layout->buffer_data_type[1] = NANOARROW_TYPE_INT32; layout->element_size_bits[1] = 32; break; case NANOARROW_TYPE_LARGE_LIST: layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; layout->buffer_data_type[1] = 
NANOARROW_TYPE_INT64; layout->element_size_bits[1] = 64; break; case NANOARROW_TYPE_STRUCT: case NANOARROW_TYPE_FIXED_SIZE_LIST: layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_NONE; layout->buffer_data_type[1] = NANOARROW_TYPE_UNINITIALIZED; break; case NANOARROW_TYPE_BOOL: layout->element_size_bits[1] = 1; break; case NANOARROW_TYPE_UINT8: case NANOARROW_TYPE_INT8: layout->element_size_bits[1] = 8; break; case NANOARROW_TYPE_UINT16: case NANOARROW_TYPE_INT16: case NANOARROW_TYPE_HALF_FLOAT: layout->element_size_bits[1] = 16; break; case NANOARROW_TYPE_UINT32: case NANOARROW_TYPE_INT32: case NANOARROW_TYPE_FLOAT: layout->element_size_bits[1] = 32; break; case NANOARROW_TYPE_INTERVAL_MONTHS: layout->buffer_data_type[1] = NANOARROW_TYPE_INT32; layout->element_size_bits[1] = 32; break; case NANOARROW_TYPE_UINT64: case NANOARROW_TYPE_INT64: case NANOARROW_TYPE_DOUBLE: case NANOARROW_TYPE_INTERVAL_DAY_TIME: layout->element_size_bits[1] = 64; break; case NANOARROW_TYPE_DECIMAL128: case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: layout->element_size_bits[1] = 128; break; case NANOARROW_TYPE_DECIMAL256: layout->element_size_bits[1] = 256; break; case NANOARROW_TYPE_FIXED_SIZE_BINARY: layout->buffer_data_type[1] = NANOARROW_TYPE_BINARY; break; case NANOARROW_TYPE_DENSE_UNION: layout->buffer_type[0] = NANOARROW_BUFFER_TYPE_TYPE_ID; layout->buffer_data_type[0] = NANOARROW_TYPE_INT8; layout->element_size_bits[0] = 8; layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_UNION_OFFSET; layout->buffer_data_type[1] = NANOARROW_TYPE_INT32; layout->element_size_bits[1] = 32; break; case NANOARROW_TYPE_SPARSE_UNION: layout->buffer_type[0] = NANOARROW_BUFFER_TYPE_TYPE_ID; layout->buffer_data_type[0] = NANOARROW_TYPE_INT8; layout->element_size_bits[0] = 8; layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_NONE; layout->buffer_data_type[1] = NANOARROW_TYPE_UNINITIALIZED; break; case NANOARROW_TYPE_STRING: case NANOARROW_TYPE_BINARY: layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; 
layout->buffer_data_type[1] = NANOARROW_TYPE_INT32; layout->element_size_bits[1] = 32; layout->buffer_type[2] = NANOARROW_BUFFER_TYPE_DATA; layout->buffer_data_type[2] = storage_type; break; case NANOARROW_TYPE_LARGE_STRING: layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; layout->buffer_data_type[1] = NANOARROW_TYPE_INT64; layout->element_size_bits[1] = 64; layout->buffer_type[2] = NANOARROW_BUFFER_TYPE_DATA; layout->buffer_data_type[2] = NANOARROW_TYPE_STRING; break; case NANOARROW_TYPE_LARGE_BINARY: layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_DATA_OFFSET; layout->buffer_data_type[1] = NANOARROW_TYPE_INT64; layout->element_size_bits[1] = 64; layout->buffer_type[2] = NANOARROW_BUFFER_TYPE_DATA; layout->buffer_data_type[2] = NANOARROW_TYPE_BINARY; break; default: break; } } void* ArrowMalloc(int64_t size) { return malloc(size); } void* ArrowRealloc(void* ptr, int64_t size) { return realloc(ptr, size); } void ArrowFree(void* ptr) { free(ptr); } static uint8_t* ArrowBufferAllocatorMallocReallocate( struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t old_size, int64_t new_size) { NANOARROW_UNUSED(allocator); NANOARROW_UNUSED(old_size); return (uint8_t*)ArrowRealloc(ptr, new_size); } static void ArrowBufferAllocatorMallocFree(struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t size) { NANOARROW_UNUSED(allocator); NANOARROW_UNUSED(size); ArrowFree(ptr); } static struct ArrowBufferAllocator ArrowBufferAllocatorMalloc = { &ArrowBufferAllocatorMallocReallocate, &ArrowBufferAllocatorMallocFree, NULL}; struct ArrowBufferAllocator ArrowBufferAllocatorDefault(void) { return ArrowBufferAllocatorMalloc; } static uint8_t* ArrowBufferAllocatorNeverReallocate( struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t old_size, int64_t new_size) { NANOARROW_UNUSED(allocator); NANOARROW_UNUSED(ptr); NANOARROW_UNUSED(old_size); NANOARROW_UNUSED(new_size); return NULL; } struct ArrowBufferAllocator ArrowBufferDeallocator( void (*custom_free)(struct 
ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t size), void* private_data) { struct ArrowBufferAllocator allocator; allocator.reallocate = &ArrowBufferAllocatorNeverReallocate; allocator.free = custom_free; allocator.private_data = private_data; return allocator; } static const int kInt32DecimalDigits = 9; static const uint64_t kUInt32PowersOfTen[] = { 1ULL, 10ULL, 100ULL, 1000ULL, 10000ULL, 100000ULL, 1000000ULL, 10000000ULL, 100000000ULL, 1000000000ULL}; // Adapted from Arrow C++ to use 32-bit words for better C portability // https://github.com/apache/arrow/blob/cd3321b28b0c9703e5d7105d6146c1270bbadd7f/cpp/src/arrow/util/decimal.cc#L524-L544 static void ShiftAndAdd(struct ArrowStringView value, uint32_t* out, int64_t out_size) { // We use strtoll for parsing, which needs input that is null-terminated char chunk_string[16]; for (int64_t posn = 0; posn < value.size_bytes;) { int64_t remaining = value.size_bytes - posn; int64_t group_size; if (remaining > kInt32DecimalDigits) { group_size = kInt32DecimalDigits; } else { group_size = remaining; } const uint64_t multiple = kUInt32PowersOfTen[group_size]; memcpy(chunk_string, value.data + posn, group_size); chunk_string[group_size] = '\0'; uint32_t chunk = (uint32_t)strtoll(chunk_string, NULL, 10); for (int64_t i = 0; i < out_size; i++) { uint64_t tmp = out[i]; tmp *= multiple; tmp += chunk; out[i] = (uint32_t)(tmp & 0xFFFFFFFFULL); chunk = (uint32_t)(tmp >> 32); } posn += group_size; } } ArrowErrorCode ArrowDecimalSetDigits(struct ArrowDecimal* decimal, struct ArrowStringView value) { // Check for sign int is_negative = value.data[0] == '-'; int has_sign = is_negative || value.data[0] == '+'; value.data += has_sign; value.size_bytes -= has_sign; // Check all characters are digits that are not the negative sign for (int64_t i = 0; i < value.size_bytes; i++) { char c = value.data[i]; if (c < '0' || c > '9') { return EINVAL; } } // Skip over leading 0s int64_t n_leading_zeroes = 0; for (int64_t i = 0; i < 
value.size_bytes; i++) { if (value.data[i] == '0') { n_leading_zeroes++; } else { break; } } value.data += n_leading_zeroes; value.size_bytes -= n_leading_zeroes; // Use 32-bit words for portability uint32_t words32[8]; int n_words32 = decimal->n_words * 2; NANOARROW_DCHECK(n_words32 <= 8); memset(words32, 0, sizeof(words32)); ShiftAndAdd(value, words32, n_words32); if (decimal->low_word_index == 0) { memcpy(decimal->words, words32, sizeof(uint32_t) * n_words32); } else { uint64_t lo; uint64_t hi; for (int i = 0; i < decimal->n_words; i++) { lo = (uint64_t)words32[i * 2]; hi = (uint64_t)words32[i * 2 + 1] << 32; decimal->words[decimal->n_words - i - 1] = lo | hi; } } if (is_negative) { ArrowDecimalNegate(decimal); } return NANOARROW_OK; } // Adapted from Arrow C++ for C // https://github.com/apache/arrow/blob/cd3321b28b0c9703e5d7105d6146c1270bbadd7f/cpp/src/arrow/util/decimal.cc#L365 ArrowErrorCode ArrowDecimalAppendDigitsToBuffer(const struct ArrowDecimal* decimal, struct ArrowBuffer* buffer) { int is_negative = ArrowDecimalSign(decimal) < 0; uint64_t words_little_endian[4]; if (decimal->low_word_index == 0) { memcpy(words_little_endian, decimal->words, decimal->n_words * sizeof(uint64_t)); } else { for (int i = 0; i < decimal->n_words; i++) { words_little_endian[i] = decimal->words[decimal->n_words - i - 1]; } } // We've already made a copy, so negate that if needed if (is_negative) { uint64_t carry = 1; for (int i = 0; i < decimal->n_words; i++) { uint64_t elem = words_little_endian[i]; elem = ~elem + carry; carry &= (elem == 0); words_little_endian[i] = elem; } } // Find the most significant word that is non-zero int most_significant_elem_idx = -1; for (int i = decimal->n_words - 1; i >= 0; i--) { if (words_little_endian[i] != 0) { most_significant_elem_idx = i; break; } } // If they are all zero, the output is just '0' if (most_significant_elem_idx == -1) { NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt8(buffer, '0')); return NANOARROW_OK; } // Define segments 
such that each segment represents 9 digits with the // least significant group of 9 digits first. For example, if the input represents // 9876543210123456789, then segments will be [123456789, 876543210, 9]. // We handle at most a signed 256 bit integer, whose maximum value occupies 77 // characters. Thus, we need at most 9 segments. const uint32_t k1e9 = 1000000000U; int num_segments = 0; uint32_t segments[9]; memset(segments, 0, sizeof(segments)); uint64_t* most_significant_elem = words_little_endian + most_significant_elem_idx; do { // Compute remainder = words_little_endian % 1e9 and words_little_endian = // words_little_endian / 1e9. uint32_t remainder = 0; uint64_t* elem = most_significant_elem; do { // Compute dividend = (remainder << 32) | *elem (a virtual 96-bit integer); // *elem = dividend / 1e9; // remainder = dividend % 1e9. uint32_t hi = (uint32_t)(*elem >> 32); uint32_t lo = (uint32_t)(*elem & 0xFFFFFFFFULL); uint64_t dividend_hi = ((uint64_t)(remainder) << 32) | hi; uint64_t quotient_hi = dividend_hi / k1e9; remainder = (uint32_t)(dividend_hi % k1e9); uint64_t dividend_lo = ((uint64_t)(remainder) << 32) | lo; uint64_t quotient_lo = dividend_lo / k1e9; remainder = (uint32_t)(dividend_lo % k1e9); *elem = (quotient_hi << 32) | quotient_lo; } while (elem-- != words_little_endian); segments[num_segments++] = remainder; } while (*most_significant_elem != 0 || most_significant_elem-- != words_little_endian); // We know our output has no more than 9 digits per segment, plus a negative sign, // plus any further digits between our output of 9 digits plus enough // extra characters to ensure that snprintf() with n = 21 (maximum length of %lu // including a the null terminator) is bounded properly. 
NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(buffer, num_segments * 9 + 1 + 21 - 9)); if (is_negative) { buffer->data[buffer->size_bytes++] = '-'; } // The most significant segment should have no leading zeroes int n_chars = snprintf((char*)buffer->data + buffer->size_bytes, 21, "%lu", (unsigned long)segments[num_segments - 1]); buffer->size_bytes += n_chars; // Subsequent output needs to be left-padded with zeroes such that each segment // takes up exactly 9 digits. for (int i = num_segments - 2; i >= 0; i--) { int n_chars = snprintf((char*)buffer->data + buffer->size_bytes, 21, "%09lu", (unsigned long)segments[i]); buffer->size_bytes += n_chars; NANOARROW_DCHECK(buffer->size_bytes <= buffer->capacity_bytes); } return NANOARROW_OK; } // Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. 
#include #include #include #include #include "nanoarrow.h" static void ArrowSchemaReleaseInternal(struct ArrowSchema* schema) { if (schema->format != NULL) ArrowFree((void*)schema->format); if (schema->name != NULL) ArrowFree((void*)schema->name); if (schema->metadata != NULL) ArrowFree((void*)schema->metadata); // This object owns the memory for all the children, but those // children may have been generated elsewhere and might have // their own release() callback. if (schema->children != NULL) { for (int64_t i = 0; i < schema->n_children; i++) { if (schema->children[i] != NULL) { if (schema->children[i]->release != NULL) { ArrowSchemaRelease(schema->children[i]); } ArrowFree(schema->children[i]); } } ArrowFree(schema->children); } // This object owns the memory for the dictionary but it // may have been generated somewhere else and have its own // release() callback. if (schema->dictionary != NULL) { if (schema->dictionary->release != NULL) { ArrowSchemaRelease(schema->dictionary); } ArrowFree(schema->dictionary); } // private data not currently used if (schema->private_data != NULL) { ArrowFree(schema->private_data); } schema->release = NULL; } static const char* ArrowSchemaFormatTemplate(enum ArrowType type) { switch (type) { case NANOARROW_TYPE_UNINITIALIZED: return NULL; case NANOARROW_TYPE_NA: return "n"; case NANOARROW_TYPE_BOOL: return "b"; case NANOARROW_TYPE_UINT8: return "C"; case NANOARROW_TYPE_INT8: return "c"; case NANOARROW_TYPE_UINT16: return "S"; case NANOARROW_TYPE_INT16: return "s"; case NANOARROW_TYPE_UINT32: return "I"; case NANOARROW_TYPE_INT32: return "i"; case NANOARROW_TYPE_UINT64: return "L"; case NANOARROW_TYPE_INT64: return "l"; case NANOARROW_TYPE_HALF_FLOAT: return "e"; case NANOARROW_TYPE_FLOAT: return "f"; case NANOARROW_TYPE_DOUBLE: return "g"; case NANOARROW_TYPE_STRING: return "u"; case NANOARROW_TYPE_LARGE_STRING: return "U"; case NANOARROW_TYPE_BINARY: return "z"; case NANOARROW_TYPE_LARGE_BINARY: return "Z"; case 
NANOARROW_TYPE_DATE32: return "tdD"; case NANOARROW_TYPE_DATE64: return "tdm"; case NANOARROW_TYPE_INTERVAL_MONTHS: return "tiM"; case NANOARROW_TYPE_INTERVAL_DAY_TIME: return "tiD"; case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: return "tin"; case NANOARROW_TYPE_LIST: return "+l"; case NANOARROW_TYPE_LARGE_LIST: return "+L"; case NANOARROW_TYPE_STRUCT: return "+s"; case NANOARROW_TYPE_MAP: return "+m"; default: return NULL; } } static int ArrowSchemaInitChildrenIfNeeded(struct ArrowSchema* schema, enum ArrowType type) { switch (type) { case NANOARROW_TYPE_LIST: case NANOARROW_TYPE_LARGE_LIST: case NANOARROW_TYPE_FIXED_SIZE_LIST: NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, 1)); ArrowSchemaInit(schema->children[0]); NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(schema->children[0], "item")); break; case NANOARROW_TYPE_MAP: NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, 1)); NANOARROW_RETURN_NOT_OK( ArrowSchemaInitFromType(schema->children[0], NANOARROW_TYPE_STRUCT)); NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(schema->children[0], "entries")); schema->children[0]->flags &= ~ARROW_FLAG_NULLABLE; NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema->children[0], 2)); ArrowSchemaInit(schema->children[0]->children[0]); ArrowSchemaInit(schema->children[0]->children[1]); NANOARROW_RETURN_NOT_OK( ArrowSchemaSetName(schema->children[0]->children[0], "key")); schema->children[0]->children[0]->flags &= ~ARROW_FLAG_NULLABLE; NANOARROW_RETURN_NOT_OK( ArrowSchemaSetName(schema->children[0]->children[1], "value")); break; default: break; } return NANOARROW_OK; } void ArrowSchemaInit(struct ArrowSchema* schema) { schema->format = NULL; schema->name = NULL; schema->metadata = NULL; schema->flags = ARROW_FLAG_NULLABLE; schema->n_children = 0; schema->children = NULL; schema->dictionary = NULL; schema->private_data = NULL; schema->release = &ArrowSchemaReleaseInternal; } ArrowErrorCode ArrowSchemaSetType(struct ArrowSchema* schema, enum ArrowType type) { 
// We don't allocate the dictionary because it has to be nullptr // for non-dictionary-encoded arrays. // Set the format to a valid format string for type const char* template_format = ArrowSchemaFormatTemplate(type); // If type isn't recognized and not explicitly unset if (template_format == NULL && type != NANOARROW_TYPE_UNINITIALIZED) { return EINVAL; } NANOARROW_RETURN_NOT_OK(ArrowSchemaSetFormat(schema, template_format)); // For types with an umabiguous child structure, allocate children return ArrowSchemaInitChildrenIfNeeded(schema, type); } ArrowErrorCode ArrowSchemaSetTypeStruct(struct ArrowSchema* schema, int64_t n_children) { NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema, NANOARROW_TYPE_STRUCT)); NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, n_children)); for (int64_t i = 0; i < n_children; i++) { ArrowSchemaInit(schema->children[i]); } return NANOARROW_OK; } ArrowErrorCode ArrowSchemaInitFromType(struct ArrowSchema* schema, enum ArrowType type) { ArrowSchemaInit(schema); int result = ArrowSchemaSetType(schema, type); if (result != NANOARROW_OK) { ArrowSchemaRelease(schema); return result; } return NANOARROW_OK; } ArrowErrorCode ArrowSchemaSetTypeFixedSize(struct ArrowSchema* schema, enum ArrowType type, int32_t fixed_size) { if (fixed_size <= 0) { return EINVAL; } char buffer[64]; int n_chars; switch (type) { case NANOARROW_TYPE_FIXED_SIZE_BINARY: n_chars = snprintf(buffer, sizeof(buffer), "w:%d", (int)fixed_size); break; case NANOARROW_TYPE_FIXED_SIZE_LIST: n_chars = snprintf(buffer, sizeof(buffer), "+w:%d", (int)fixed_size); break; default: return EINVAL; } buffer[n_chars] = '\0'; NANOARROW_RETURN_NOT_OK(ArrowSchemaSetFormat(schema, buffer)); if (type == NANOARROW_TYPE_FIXED_SIZE_LIST) { NANOARROW_RETURN_NOT_OK(ArrowSchemaInitChildrenIfNeeded(schema, type)); } return NANOARROW_OK; } ArrowErrorCode ArrowSchemaSetTypeDecimal(struct ArrowSchema* schema, enum ArrowType type, int32_t decimal_precision, int32_t decimal_scale) { if 
(decimal_precision <= 0) { return EINVAL; } char buffer[64]; int n_chars; switch (type) { case NANOARROW_TYPE_DECIMAL128: n_chars = snprintf(buffer, sizeof(buffer), "d:%d,%d", decimal_precision, decimal_scale); break; case NANOARROW_TYPE_DECIMAL256: n_chars = snprintf(buffer, sizeof(buffer), "d:%d,%d,256", decimal_precision, decimal_scale); break; default: return EINVAL; } buffer[n_chars] = '\0'; return ArrowSchemaSetFormat(schema, buffer); } static const char* ArrowTimeUnitFormatString(enum ArrowTimeUnit time_unit) { switch (time_unit) { case NANOARROW_TIME_UNIT_SECOND: return "s"; case NANOARROW_TIME_UNIT_MILLI: return "m"; case NANOARROW_TIME_UNIT_MICRO: return "u"; case NANOARROW_TIME_UNIT_NANO: return "n"; default: return NULL; } } ArrowErrorCode ArrowSchemaSetTypeDateTime(struct ArrowSchema* schema, enum ArrowType type, enum ArrowTimeUnit time_unit, const char* timezone) { const char* time_unit_str = ArrowTimeUnitFormatString(time_unit); if (time_unit_str == NULL) { return EINVAL; } char buffer[128]; int n_chars; switch (type) { case NANOARROW_TYPE_TIME32: if (timezone != NULL) { return EINVAL; } switch (time_unit) { case NANOARROW_TIME_UNIT_MICRO: case NANOARROW_TIME_UNIT_NANO: return EINVAL; default: break; } n_chars = snprintf(buffer, sizeof(buffer), "tt%s", time_unit_str); break; case NANOARROW_TYPE_TIME64: if (timezone != NULL) { return EINVAL; } switch (time_unit) { case NANOARROW_TIME_UNIT_SECOND: case NANOARROW_TIME_UNIT_MILLI: return EINVAL; default: break; } n_chars = snprintf(buffer, sizeof(buffer), "tt%s", time_unit_str); break; case NANOARROW_TYPE_TIMESTAMP: if (timezone == NULL) { timezone = ""; } n_chars = snprintf(buffer, sizeof(buffer), "ts%s:%s", time_unit_str, timezone); break; case NANOARROW_TYPE_DURATION: if (timezone != NULL) { return EINVAL; } n_chars = snprintf(buffer, sizeof(buffer), "tD%s", time_unit_str); break; default: return EINVAL; } if (((size_t)n_chars) >= sizeof(buffer)) { return ERANGE; } buffer[n_chars] = '\0'; return 
ArrowSchemaSetFormat(schema, buffer); } ArrowErrorCode ArrowSchemaSetTypeUnion(struct ArrowSchema* schema, enum ArrowType type, int64_t n_children) { if (n_children < 0 || n_children > 127) { return EINVAL; } // Max valid size would be +ud:0,1,...126 = 401 characters + null terminator char format_out[512]; int64_t format_out_size = 512; memset(format_out, 0, format_out_size); int n_chars; char* format_cursor = format_out; switch (type) { case NANOARROW_TYPE_SPARSE_UNION: n_chars = snprintf(format_cursor, format_out_size, "+us:"); format_cursor += n_chars; format_out_size -= n_chars; break; case NANOARROW_TYPE_DENSE_UNION: n_chars = snprintf(format_cursor, format_out_size, "+ud:"); format_cursor += n_chars; format_out_size -= n_chars; break; default: return EINVAL; } if (n_children > 0) { n_chars = snprintf(format_cursor, format_out_size, "0"); format_cursor += n_chars; format_out_size -= n_chars; for (int64_t i = 1; i < n_children; i++) { n_chars = snprintf(format_cursor, format_out_size, ",%d", (int)i); format_cursor += n_chars; format_out_size -= n_chars; } } NANOARROW_RETURN_NOT_OK(ArrowSchemaSetFormat(schema, format_out)); NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, n_children)); for (int64_t i = 0; i < n_children; i++) { ArrowSchemaInit(schema->children[i]); } return NANOARROW_OK; } ArrowErrorCode ArrowSchemaSetFormat(struct ArrowSchema* schema, const char* format) { if (schema->format != NULL) { ArrowFree((void*)schema->format); } if (format != NULL) { size_t format_size = strlen(format) + 1; schema->format = (const char*)ArrowMalloc(format_size); if (schema->format == NULL) { return ENOMEM; } memcpy((void*)schema->format, format, format_size); } else { schema->format = NULL; } return NANOARROW_OK; } ArrowErrorCode ArrowSchemaSetName(struct ArrowSchema* schema, const char* name) { if (schema->name != NULL) { ArrowFree((void*)schema->name); } if (name != NULL) { size_t name_size = strlen(name) + 1; schema->name = (const 
char*)ArrowMalloc(name_size); if (schema->name == NULL) { return ENOMEM; } memcpy((void*)schema->name, name, name_size); } else { schema->name = NULL; } return NANOARROW_OK; } ArrowErrorCode ArrowSchemaSetMetadata(struct ArrowSchema* schema, const char* metadata) { if (schema->metadata != NULL) { ArrowFree((void*)schema->metadata); } if (metadata != NULL) { size_t metadata_size = ArrowMetadataSizeOf(metadata); schema->metadata = (const char*)ArrowMalloc(metadata_size); if (schema->metadata == NULL) { return ENOMEM; } memcpy((void*)schema->metadata, metadata, metadata_size); } else { schema->metadata = NULL; } return NANOARROW_OK; } ArrowErrorCode ArrowSchemaAllocateChildren(struct ArrowSchema* schema, int64_t n_children) { if (schema->children != NULL) { return EEXIST; } if (n_children > 0) { schema->children = (struct ArrowSchema**)ArrowMalloc(n_children * sizeof(struct ArrowSchema*)); if (schema->children == NULL) { return ENOMEM; } schema->n_children = n_children; memset(schema->children, 0, n_children * sizeof(struct ArrowSchema*)); for (int64_t i = 0; i < n_children; i++) { schema->children[i] = (struct ArrowSchema*)ArrowMalloc(sizeof(struct ArrowSchema)); if (schema->children[i] == NULL) { return ENOMEM; } schema->children[i]->release = NULL; } } return NANOARROW_OK; } ArrowErrorCode ArrowSchemaAllocateDictionary(struct ArrowSchema* schema) { if (schema->dictionary != NULL) { return EEXIST; } schema->dictionary = (struct ArrowSchema*)ArrowMalloc(sizeof(struct ArrowSchema)); if (schema->dictionary == NULL) { return ENOMEM; } schema->dictionary->release = NULL; return NANOARROW_OK; } ArrowErrorCode ArrowSchemaDeepCopy(const struct ArrowSchema* schema, struct ArrowSchema* schema_out) { ArrowSchemaInit(schema_out); int result = ArrowSchemaSetFormat(schema_out, schema->format); if (result != NANOARROW_OK) { ArrowSchemaRelease(schema_out); return result; } schema_out->flags = schema->flags; result = ArrowSchemaSetName(schema_out, schema->name); if (result != 
NANOARROW_OK) { ArrowSchemaRelease(schema_out); return result; } result = ArrowSchemaSetMetadata(schema_out, schema->metadata); if (result != NANOARROW_OK) { ArrowSchemaRelease(schema_out); return result; } result = ArrowSchemaAllocateChildren(schema_out, schema->n_children); if (result != NANOARROW_OK) { ArrowSchemaRelease(schema_out); return result; } for (int64_t i = 0; i < schema->n_children; i++) { result = ArrowSchemaDeepCopy(schema->children[i], schema_out->children[i]); if (result != NANOARROW_OK) { ArrowSchemaRelease(schema_out); return result; } } if (schema->dictionary != NULL) { result = ArrowSchemaAllocateDictionary(schema_out); if (result != NANOARROW_OK) { ArrowSchemaRelease(schema_out); return result; } result = ArrowSchemaDeepCopy(schema->dictionary, schema_out->dictionary); if (result != NANOARROW_OK) { ArrowSchemaRelease(schema_out); return result; } } return NANOARROW_OK; } static void ArrowSchemaViewSetPrimitive(struct ArrowSchemaView* schema_view, enum ArrowType type) { schema_view->type = type; schema_view->storage_type = type; } static ArrowErrorCode ArrowSchemaViewParse(struct ArrowSchemaView* schema_view, const char* format, const char** format_end_out, struct ArrowError* error) { *format_end_out = format; // needed for decimal parsing const char* parse_start; char* parse_end; switch (format[0]) { case 'n': schema_view->type = NANOARROW_TYPE_NA; schema_view->storage_type = NANOARROW_TYPE_NA; *format_end_out = format + 1; return NANOARROW_OK; case 'b': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_BOOL); *format_end_out = format + 1; return NANOARROW_OK; case 'c': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT8); *format_end_out = format + 1; return NANOARROW_OK; case 'C': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_UINT8); *format_end_out = format + 1; return NANOARROW_OK; case 's': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT16); *format_end_out = format + 1; return NANOARROW_OK; case 'S': 
ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_UINT16); *format_end_out = format + 1; return NANOARROW_OK; case 'i': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT32); *format_end_out = format + 1; return NANOARROW_OK; case 'I': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_UINT32); *format_end_out = format + 1; return NANOARROW_OK; case 'l': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); *format_end_out = format + 1; return NANOARROW_OK; case 'L': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_UINT64); *format_end_out = format + 1; return NANOARROW_OK; case 'e': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_HALF_FLOAT); *format_end_out = format + 1; return NANOARROW_OK; case 'f': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_FLOAT); *format_end_out = format + 1; return NANOARROW_OK; case 'g': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_DOUBLE); *format_end_out = format + 1; return NANOARROW_OK; // decimal case 'd': if (format[1] != ':' || format[2] == '\0') { ArrowErrorSet(error, "Expected ':precision,scale[,bitwidth]' following 'd'"); return EINVAL; } parse_start = format + 2; schema_view->decimal_precision = (int32_t)strtol(parse_start, &parse_end, 10); if (parse_end == parse_start || parse_end[0] != ',') { ArrowErrorSet(error, "Expected 'precision,scale[,bitwidth]' following 'd:'"); return EINVAL; } parse_start = parse_end + 1; schema_view->decimal_scale = (int32_t)strtol(parse_start, &parse_end, 10); if (parse_end == parse_start) { ArrowErrorSet(error, "Expected 'scale[,bitwidth]' following 'd:precision,'"); return EINVAL; } else if (parse_end[0] != ',') { schema_view->decimal_bitwidth = 128; } else { parse_start = parse_end + 1; schema_view->decimal_bitwidth = (int32_t)strtol(parse_start, &parse_end, 10); if (parse_start == parse_end) { ArrowErrorSet(error, "Expected precision following 'd:precision,scale,'"); return EINVAL; } } *format_end_out = parse_end; switch 
(schema_view->decimal_bitwidth) { case 128: ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_DECIMAL128); return NANOARROW_OK; case 256: ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_DECIMAL256); return NANOARROW_OK; default: ArrowErrorSet(error, "Expected decimal bitwidth of 128 or 256 but found %d", (int)schema_view->decimal_bitwidth); return EINVAL; } // validity + data case 'w': schema_view->type = NANOARROW_TYPE_FIXED_SIZE_BINARY; schema_view->storage_type = NANOARROW_TYPE_FIXED_SIZE_BINARY; if (format[1] != ':' || format[2] == '\0') { ArrowErrorSet(error, "Expected ':' following 'w'"); return EINVAL; } schema_view->fixed_size = (int32_t)strtol(format + 2, (char**)format_end_out, 10); return NANOARROW_OK; // validity + offset + data case 'z': schema_view->type = NANOARROW_TYPE_BINARY; schema_view->storage_type = NANOARROW_TYPE_BINARY; *format_end_out = format + 1; return NANOARROW_OK; case 'u': schema_view->type = NANOARROW_TYPE_STRING; schema_view->storage_type = NANOARROW_TYPE_STRING; *format_end_out = format + 1; return NANOARROW_OK; // validity + large_offset + data case 'Z': schema_view->type = NANOARROW_TYPE_LARGE_BINARY; schema_view->storage_type = NANOARROW_TYPE_LARGE_BINARY; *format_end_out = format + 1; return NANOARROW_OK; case 'U': schema_view->type = NANOARROW_TYPE_LARGE_STRING; schema_view->storage_type = NANOARROW_TYPE_LARGE_STRING; *format_end_out = format + 1; return NANOARROW_OK; // nested types case '+': switch (format[1]) { // list has validity + offset or offset case 'l': schema_view->storage_type = NANOARROW_TYPE_LIST; schema_view->type = NANOARROW_TYPE_LIST; *format_end_out = format + 2; return NANOARROW_OK; // large list has validity + large_offset or large_offset case 'L': schema_view->storage_type = NANOARROW_TYPE_LARGE_LIST; schema_view->type = NANOARROW_TYPE_LARGE_LIST; *format_end_out = format + 2; return NANOARROW_OK; // just validity buffer case 'w': if (format[2] != ':' || format[3] == '\0') { 
ArrowErrorSet(error, "Expected ':' following '+w'"); return EINVAL; } schema_view->storage_type = NANOARROW_TYPE_FIXED_SIZE_LIST; schema_view->type = NANOARROW_TYPE_FIXED_SIZE_LIST; schema_view->fixed_size = (int32_t)strtol(format + 3, (char**)format_end_out, 10); return NANOARROW_OK; case 's': schema_view->storage_type = NANOARROW_TYPE_STRUCT; schema_view->type = NANOARROW_TYPE_STRUCT; *format_end_out = format + 2; return NANOARROW_OK; case 'm': schema_view->storage_type = NANOARROW_TYPE_MAP; schema_view->type = NANOARROW_TYPE_MAP; *format_end_out = format + 2; return NANOARROW_OK; // unions case 'u': switch (format[2]) { case 'd': schema_view->storage_type = NANOARROW_TYPE_DENSE_UNION; schema_view->type = NANOARROW_TYPE_DENSE_UNION; break; case 's': schema_view->storage_type = NANOARROW_TYPE_SPARSE_UNION; schema_view->type = NANOARROW_TYPE_SPARSE_UNION; break; default: ArrowErrorSet(error, "Expected union format string +us: or " "+ud: but found '%s'", format); return EINVAL; } if (format[3] == ':') { schema_view->union_type_ids = format + 4; int64_t n_type_ids = _ArrowParseUnionTypeIds(schema_view->union_type_ids, NULL); if (n_type_ids != schema_view->schema->n_children) { ArrowErrorSet( error, "Expected union type_ids parameter to be a comma-separated list of %ld " "values between 0 and 127 but found '%s'", (long)schema_view->schema->n_children, schema_view->union_type_ids); return EINVAL; } *format_end_out = format + strlen(format); return NANOARROW_OK; } else { ArrowErrorSet(error, "Expected union format string +us: or +ud: " "but found '%s'", format); return EINVAL; } default: ArrowErrorSet(error, "Expected nested type format string but found '%s'", format); return EINVAL; } // date/time types case 't': switch (format[1]) { // date case 'd': switch (format[2]) { case 'D': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT32); schema_view->type = NANOARROW_TYPE_DATE32; *format_end_out = format + 3; return NANOARROW_OK; case 'm': 
ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); schema_view->type = NANOARROW_TYPE_DATE64; *format_end_out = format + 3; return NANOARROW_OK; default: ArrowErrorSet(error, "Expected 'D' or 'm' following 'td' but found '%s'", format + 2); return EINVAL; } // time of day case 't': switch (format[2]) { case 's': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT32); schema_view->type = NANOARROW_TYPE_TIME32; schema_view->time_unit = NANOARROW_TIME_UNIT_SECOND; *format_end_out = format + 3; return NANOARROW_OK; case 'm': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT32); schema_view->type = NANOARROW_TYPE_TIME32; schema_view->time_unit = NANOARROW_TIME_UNIT_MILLI; *format_end_out = format + 3; return NANOARROW_OK; case 'u': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); schema_view->type = NANOARROW_TYPE_TIME64; schema_view->time_unit = NANOARROW_TIME_UNIT_MICRO; *format_end_out = format + 3; return NANOARROW_OK; case 'n': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); schema_view->type = NANOARROW_TYPE_TIME64; schema_view->time_unit = NANOARROW_TIME_UNIT_NANO; *format_end_out = format + 3; return NANOARROW_OK; default: ArrowErrorSet( error, "Expected 's', 'm', 'u', or 'n' following 'tt' but found '%s'", format + 2); return EINVAL; } // timestamp case 's': switch (format[2]) { case 's': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); schema_view->type = NANOARROW_TYPE_TIMESTAMP; schema_view->time_unit = NANOARROW_TIME_UNIT_SECOND; break; case 'm': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); schema_view->type = NANOARROW_TYPE_TIMESTAMP; schema_view->time_unit = NANOARROW_TIME_UNIT_MILLI; break; case 'u': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); schema_view->type = NANOARROW_TYPE_TIMESTAMP; schema_view->time_unit = NANOARROW_TIME_UNIT_MICRO; break; case 'n': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); schema_view->type = 
NANOARROW_TYPE_TIMESTAMP; schema_view->time_unit = NANOARROW_TIME_UNIT_NANO; break; default: ArrowErrorSet( error, "Expected 's', 'm', 'u', or 'n' following 'ts' but found '%s'", format + 2); return EINVAL; } if (format[3] != ':') { ArrowErrorSet(error, "Expected ':' following '%.3s' but found '%s'", format, format + 3); return EINVAL; } schema_view->timezone = format + 4; *format_end_out = format + strlen(format); return NANOARROW_OK; // duration case 'D': switch (format[2]) { case 's': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); schema_view->type = NANOARROW_TYPE_DURATION; schema_view->time_unit = NANOARROW_TIME_UNIT_SECOND; *format_end_out = format + 3; return NANOARROW_OK; case 'm': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); schema_view->type = NANOARROW_TYPE_DURATION; schema_view->time_unit = NANOARROW_TIME_UNIT_MILLI; *format_end_out = format + 3; return NANOARROW_OK; case 'u': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); schema_view->type = NANOARROW_TYPE_DURATION; schema_view->time_unit = NANOARROW_TIME_UNIT_MICRO; *format_end_out = format + 3; return NANOARROW_OK; case 'n': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64); schema_view->type = NANOARROW_TYPE_DURATION; schema_view->time_unit = NANOARROW_TIME_UNIT_NANO; *format_end_out = format + 3; return NANOARROW_OK; default: ArrowErrorSet(error, "Expected 's', 'm', u', or 'n' following 'tD' but found '%s'", format + 2); return EINVAL; } // interval case 'i': switch (format[2]) { case 'M': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INTERVAL_MONTHS); *format_end_out = format + 3; return NANOARROW_OK; case 'D': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INTERVAL_DAY_TIME); *format_end_out = format + 3; return NANOARROW_OK; case 'n': ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO); *format_end_out = format + 3; return NANOARROW_OK; default: ArrowErrorSet(error, "Expected 'M', 
'D', or 'n' following 'ti' but found '%s'", format + 2); return EINVAL; } default: ArrowErrorSet( error, "Expected 'd', 't', 's', 'D', or 'i' following 't' but found '%s'", format + 1); return EINVAL; } default: ArrowErrorSet(error, "Unknown format: '%s'", format); return EINVAL; } } static ArrowErrorCode ArrowSchemaViewValidateNChildren( struct ArrowSchemaView* schema_view, int64_t n_children, struct ArrowError* error) { if (n_children != -1 && schema_view->schema->n_children != n_children) { ArrowErrorSet(error, "Expected schema with %d children but found %d children", (int)n_children, (int)schema_view->schema->n_children); return EINVAL; } // Don't do a full validation of children but do check that they won't // segfault if inspected struct ArrowSchema* child; for (int64_t i = 0; i < schema_view->schema->n_children; i++) { child = schema_view->schema->children[i]; if (child == NULL) { ArrowErrorSet(error, "Expected valid schema at schema->children[%ld] but found NULL", (long)i); return EINVAL; } else if (child->release == NULL) { ArrowErrorSet( error, "Expected valid schema at schema->children[%ld] but found a released schema", (long)i); return EINVAL; } } return NANOARROW_OK; } static ArrowErrorCode ArrowSchemaViewValidateUnion(struct ArrowSchemaView* schema_view, struct ArrowError* error) { return ArrowSchemaViewValidateNChildren(schema_view, -1, error); } static ArrowErrorCode ArrowSchemaViewValidateMap(struct ArrowSchemaView* schema_view, struct ArrowError* error) { NANOARROW_RETURN_NOT_OK(ArrowSchemaViewValidateNChildren(schema_view, 1, error)); if (schema_view->schema->children[0]->n_children != 2) { ArrowErrorSet(error, "Expected child of map type to have 2 children but found %d", (int)schema_view->schema->children[0]->n_children); return EINVAL; } if (strcmp(schema_view->schema->children[0]->format, "+s") != 0) { ArrowErrorSet(error, "Expected format of child of map type to be '+s' but found '%s'", schema_view->schema->children[0]->format); return 
EINVAL; } if (schema_view->schema->children[0]->flags & ARROW_FLAG_NULLABLE) { ArrowErrorSet(error, "Expected child of map type to be non-nullable but was nullable"); return EINVAL; } if (schema_view->schema->children[0]->children[0]->flags & ARROW_FLAG_NULLABLE) { ArrowErrorSet(error, "Expected key of map type to be non-nullable but was nullable"); return EINVAL; } return NANOARROW_OK; } static ArrowErrorCode ArrowSchemaViewValidateDictionary( struct ArrowSchemaView* schema_view, struct ArrowError* error) { // check for valid index type switch (schema_view->storage_type) { case NANOARROW_TYPE_UINT8: case NANOARROW_TYPE_INT8: case NANOARROW_TYPE_UINT16: case NANOARROW_TYPE_INT16: case NANOARROW_TYPE_UINT32: case NANOARROW_TYPE_INT32: case NANOARROW_TYPE_UINT64: case NANOARROW_TYPE_INT64: break; default: ArrowErrorSet( error, "Expected dictionary schema index type to be an integral type but found '%s'", schema_view->schema->format); return EINVAL; } struct ArrowSchemaView dictionary_schema_view; return ArrowSchemaViewInit(&dictionary_schema_view, schema_view->schema->dictionary, error); } static ArrowErrorCode ArrowSchemaViewValidate(struct ArrowSchemaView* schema_view, enum ArrowType type, struct ArrowError* error) { switch (type) { case NANOARROW_TYPE_NA: case NANOARROW_TYPE_BOOL: case NANOARROW_TYPE_UINT8: case NANOARROW_TYPE_INT8: case NANOARROW_TYPE_UINT16: case NANOARROW_TYPE_INT16: case NANOARROW_TYPE_UINT32: case NANOARROW_TYPE_INT32: case NANOARROW_TYPE_UINT64: case NANOARROW_TYPE_INT64: case NANOARROW_TYPE_HALF_FLOAT: case NANOARROW_TYPE_FLOAT: case NANOARROW_TYPE_DOUBLE: case NANOARROW_TYPE_DECIMAL128: case NANOARROW_TYPE_DECIMAL256: case NANOARROW_TYPE_STRING: case NANOARROW_TYPE_LARGE_STRING: case NANOARROW_TYPE_BINARY: case NANOARROW_TYPE_LARGE_BINARY: case NANOARROW_TYPE_DATE32: case NANOARROW_TYPE_DATE64: case NANOARROW_TYPE_INTERVAL_MONTHS: case NANOARROW_TYPE_INTERVAL_DAY_TIME: case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: case 
NANOARROW_TYPE_TIMESTAMP: case NANOARROW_TYPE_TIME32: case NANOARROW_TYPE_TIME64: case NANOARROW_TYPE_DURATION: return ArrowSchemaViewValidateNChildren(schema_view, 0, error); case NANOARROW_TYPE_FIXED_SIZE_BINARY: if (schema_view->fixed_size <= 0) { ArrowErrorSet(error, "Expected size > 0 for fixed size binary but found size %d", schema_view->fixed_size); return EINVAL; } return ArrowSchemaViewValidateNChildren(schema_view, 0, error); case NANOARROW_TYPE_LIST: case NANOARROW_TYPE_LARGE_LIST: case NANOARROW_TYPE_FIXED_SIZE_LIST: return ArrowSchemaViewValidateNChildren(schema_view, 1, error); case NANOARROW_TYPE_STRUCT: return ArrowSchemaViewValidateNChildren(schema_view, -1, error); case NANOARROW_TYPE_SPARSE_UNION: case NANOARROW_TYPE_DENSE_UNION: return ArrowSchemaViewValidateUnion(schema_view, error); case NANOARROW_TYPE_MAP: return ArrowSchemaViewValidateMap(schema_view, error); case NANOARROW_TYPE_DICTIONARY: return ArrowSchemaViewValidateDictionary(schema_view, error); default: ArrowErrorSet(error, "Expected a valid enum ArrowType value but found %d", (int)schema_view->type); return EINVAL; } return NANOARROW_OK; } ArrowErrorCode ArrowSchemaViewInit(struct ArrowSchemaView* schema_view, const struct ArrowSchema* schema, struct ArrowError* error) { if (schema == NULL) { ArrowErrorSet(error, "Expected non-NULL schema"); return EINVAL; } if (schema->release == NULL) { ArrowErrorSet(error, "Expected non-released schema"); return EINVAL; } schema_view->schema = schema; const char* format = schema->format; if (format == NULL) { ArrowErrorSet( error, "Error parsing schema->format: Expected a null-terminated string but found NULL"); return EINVAL; } size_t format_len = strlen(format); if (format_len == 0) { ArrowErrorSet(error, "Error parsing schema->format: Expected a string with size > 0"); return EINVAL; } const char* format_end_out; int result = ArrowSchemaViewParse(schema_view, format, &format_end_out, error); if (result != NANOARROW_OK) { if (error != NULL) { 
char child_error[1024]; memcpy(child_error, ArrowErrorMessage(error), 1024); ArrowErrorSet(error, "Error parsing schema->format: %s", child_error); } return result; } if ((format + format_len) != format_end_out) { ArrowErrorSet(error, "Error parsing schema->format '%s': parsed %d/%d characters", format, (int)(format_end_out - format), (int)(format_len)); return EINVAL; } if (schema->dictionary != NULL) { schema_view->type = NANOARROW_TYPE_DICTIONARY; } NANOARROW_RETURN_NOT_OK( ArrowSchemaViewValidate(schema_view, schema_view->storage_type, error)); if (schema_view->storage_type != schema_view->type) { NANOARROW_RETURN_NOT_OK( ArrowSchemaViewValidate(schema_view, schema_view->type, error)); } int64_t unknown_flags = schema->flags & ~NANOARROW_FLAG_ALL_SUPPORTED; if (unknown_flags != 0) { ArrowErrorSet(error, "Unknown ArrowSchema flag"); return EINVAL; } if (schema->flags & ARROW_FLAG_DICTIONARY_ORDERED && schema_view->type != NANOARROW_TYPE_DICTIONARY) { ArrowErrorSet(error, "ARROW_FLAG_DICTIONARY_ORDERED is only relevant for dictionaries"); return EINVAL; } if (schema->flags & ARROW_FLAG_MAP_KEYS_SORTED && schema_view->type != NANOARROW_TYPE_MAP) { ArrowErrorSet(error, "ARROW_FLAG_MAP_KEYS_SORTED is only relevant for a map type"); return EINVAL; } ArrowLayoutInit(&schema_view->layout, schema_view->storage_type); if (schema_view->storage_type == NANOARROW_TYPE_FIXED_SIZE_BINARY) { schema_view->layout.element_size_bits[1] = schema_view->fixed_size * 8; } else if (schema_view->storage_type == NANOARROW_TYPE_FIXED_SIZE_LIST) { schema_view->layout.child_size_elements = schema_view->fixed_size; } schema_view->extension_name = ArrowCharView(NULL); schema_view->extension_metadata = ArrowCharView(NULL); NANOARROW_RETURN_NOT_OK(ArrowMetadataGetValue(schema->metadata, ArrowCharView("ARROW:extension:name"), &schema_view->extension_name)); NANOARROW_RETURN_NOT_OK(ArrowMetadataGetValue(schema->metadata, ArrowCharView("ARROW:extension:metadata"), 
&schema_view->extension_metadata)); return NANOARROW_OK; } static int64_t ArrowSchemaTypeToStringInternal(struct ArrowSchemaView* schema_view, char* out, int64_t n) { const char* type_string = ArrowTypeString(schema_view->type); switch (schema_view->type) { case NANOARROW_TYPE_DECIMAL128: case NANOARROW_TYPE_DECIMAL256: return snprintf(out, n, "%s(%d, %d)", type_string, (int)schema_view->decimal_precision, (int)schema_view->decimal_scale); case NANOARROW_TYPE_TIMESTAMP: return snprintf(out, n, "%s('%s', '%s')", type_string, ArrowTimeUnitString(schema_view->time_unit), schema_view->timezone); case NANOARROW_TYPE_TIME32: case NANOARROW_TYPE_TIME64: case NANOARROW_TYPE_DURATION: return snprintf(out, n, "%s('%s')", type_string, ArrowTimeUnitString(schema_view->time_unit)); case NANOARROW_TYPE_FIXED_SIZE_BINARY: case NANOARROW_TYPE_FIXED_SIZE_LIST: return snprintf(out, n, "%s(%ld)", type_string, (long)schema_view->fixed_size); case NANOARROW_TYPE_SPARSE_UNION: case NANOARROW_TYPE_DENSE_UNION: return snprintf(out, n, "%s([%s])", type_string, schema_view->union_type_ids); default: return snprintf(out, n, "%s", type_string); } } // Helper for bookkeeping to emulate sprintf()-like behaviour spread // among multiple sprintf calls. 
static inline void ArrowToStringLogChars(char** out, int64_t n_chars_last, int64_t* n_remaining, int64_t* n_chars) { *n_chars += n_chars_last; *n_remaining -= n_chars_last; // n_remaining is never less than 0 if (*n_remaining < 0) { *n_remaining = 0; } // Can't do math on a NULL pointer if (*out != NULL) { *out += n_chars_last; } } int64_t ArrowSchemaToString(const struct ArrowSchema* schema, char* out, int64_t n, char recursive) { if (schema == NULL) { return snprintf(out, n, "[invalid: pointer is null]"); } if (schema->release == NULL) { return snprintf(out, n, "[invalid: schema is released]"); } struct ArrowSchemaView schema_view; struct ArrowError error; if (ArrowSchemaViewInit(&schema_view, schema, &error) != NANOARROW_OK) { return snprintf(out, n, "[invalid: %s]", ArrowErrorMessage(&error)); } // Extension type and dictionary should include both the top-level type // and the storage type. int is_extension = schema_view.extension_name.size_bytes > 0; int is_dictionary = schema->dictionary != NULL; int64_t n_chars = 0; int64_t n_chars_last = 0; // Uncommon but not technically impossible that both are true if (is_extension && is_dictionary) { n_chars_last = snprintf( out, n, "%.*s{dictionary(%s)<", (int)schema_view.extension_name.size_bytes, schema_view.extension_name.data, ArrowTypeString(schema_view.storage_type)); } else if (is_extension) { n_chars_last = snprintf(out, n, "%.*s{", (int)schema_view.extension_name.size_bytes, schema_view.extension_name.data); } else if (is_dictionary) { n_chars_last = snprintf(out, n, "dictionary(%s)<", ArrowTypeString(schema_view.storage_type)); } ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); if (!is_dictionary) { n_chars_last = ArrowSchemaTypeToStringInternal(&schema_view, out, n); } else { n_chars_last = ArrowSchemaToString(schema->dictionary, out, n, recursive); } ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); if (recursive && schema->format[0] == '+') { n_chars_last = snprintf(out, n, "<"); 
ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); for (int64_t i = 0; i < schema->n_children; i++) { if (i > 0) { n_chars_last = snprintf(out, n, ", "); ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); } // ArrowSchemaToStringInternal() will validate the child and print the error, // but we need the name first if (schema->children[i] != NULL && schema->children[i]->release != NULL && schema->children[i]->name != NULL) { n_chars_last = snprintf(out, n, "%s: ", schema->children[i]->name); ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); } n_chars_last = ArrowSchemaToString(schema->children[i], out, n, recursive); ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); } n_chars_last = snprintf(out, n, ">"); ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars); } if (is_extension && is_dictionary) { n_chars += snprintf(out, n, ">}"); } else if (is_extension) { n_chars += snprintf(out, n, "}"); } else if (is_dictionary) { n_chars += snprintf(out, n, ">"); } return n_chars; } ArrowErrorCode ArrowMetadataReaderInit(struct ArrowMetadataReader* reader, const char* metadata) { reader->metadata = metadata; if (reader->metadata == NULL) { reader->offset = 0; reader->remaining_keys = 0; } else { memcpy(&reader->remaining_keys, reader->metadata, sizeof(int32_t)); reader->offset = sizeof(int32_t); } return NANOARROW_OK; } ArrowErrorCode ArrowMetadataReaderRead(struct ArrowMetadataReader* reader, struct ArrowStringView* key_out, struct ArrowStringView* value_out) { if (reader->remaining_keys <= 0) { return EINVAL; } int64_t pos = 0; int32_t key_size; memcpy(&key_size, reader->metadata + reader->offset + pos, sizeof(int32_t)); pos += sizeof(int32_t); key_out->data = reader->metadata + reader->offset + pos; key_out->size_bytes = key_size; pos += key_size; int32_t value_size; memcpy(&value_size, reader->metadata + reader->offset + pos, sizeof(int32_t)); pos += sizeof(int32_t); value_out->data = reader->metadata + reader->offset + pos; value_out->size_bytes 
= value_size; pos += value_size; reader->offset += pos; reader->remaining_keys--; return NANOARROW_OK; } int64_t ArrowMetadataSizeOf(const char* metadata) { if (metadata == NULL) { return 0; } struct ArrowMetadataReader reader; struct ArrowStringView key; struct ArrowStringView value; if (ArrowMetadataReaderInit(&reader, metadata) != NANOARROW_OK) { return 0; } int64_t size = sizeof(int32_t); while (ArrowMetadataReaderRead(&reader, &key, &value) == NANOARROW_OK) { size += sizeof(int32_t) + key.size_bytes + sizeof(int32_t) + value.size_bytes; } return size; } static ArrowErrorCode ArrowMetadataGetValueInternal(const char* metadata, struct ArrowStringView* key, struct ArrowStringView* value_out) { struct ArrowMetadataReader reader; struct ArrowStringView existing_key; struct ArrowStringView existing_value; NANOARROW_RETURN_NOT_OK(ArrowMetadataReaderInit(&reader, metadata)); while (ArrowMetadataReaderRead(&reader, &existing_key, &existing_value) == NANOARROW_OK) { int key_equal = key->size_bytes == existing_key.size_bytes && strncmp(key->data, existing_key.data, existing_key.size_bytes) == 0; if (key_equal) { value_out->data = existing_value.data; value_out->size_bytes = existing_value.size_bytes; break; } } return NANOARROW_OK; } ArrowErrorCode ArrowMetadataGetValue(const char* metadata, struct ArrowStringView key, struct ArrowStringView* value_out) { if (value_out == NULL) { return EINVAL; } return ArrowMetadataGetValueInternal(metadata, &key, value_out); } char ArrowMetadataHasKey(const char* metadata, struct ArrowStringView key) { struct ArrowStringView value = ArrowCharView(NULL); if (ArrowMetadataGetValue(metadata, key, &value) != NANOARROW_OK) { return 0; } return value.data != NULL; } ArrowErrorCode ArrowMetadataBuilderInit(struct ArrowBuffer* buffer, const char* metadata) { ArrowBufferInit(buffer); return ArrowBufferAppend(buffer, metadata, ArrowMetadataSizeOf(metadata)); } static ArrowErrorCode ArrowMetadataBuilderAppendInternal(struct ArrowBuffer* buffer, 
struct ArrowStringView* key, struct ArrowStringView* value) { if (value == NULL) { return NANOARROW_OK; } if (buffer->capacity_bytes == 0) { NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(buffer, 0)); } if (((size_t)buffer->capacity_bytes) < sizeof(int32_t)) { return EINVAL; } int32_t n_keys; memcpy(&n_keys, buffer->data, sizeof(int32_t)); int32_t key_size = (int32_t)key->size_bytes; int32_t value_size = (int32_t)value->size_bytes; NANOARROW_RETURN_NOT_OK(ArrowBufferReserve( buffer, sizeof(int32_t) + key_size + sizeof(int32_t) + value_size)); ArrowBufferAppendUnsafe(buffer, &key_size, sizeof(int32_t)); ArrowBufferAppendUnsafe(buffer, key->data, key_size); ArrowBufferAppendUnsafe(buffer, &value_size, sizeof(int32_t)); ArrowBufferAppendUnsafe(buffer, value->data, value_size); n_keys++; memcpy(buffer->data, &n_keys, sizeof(int32_t)); return NANOARROW_OK; } static ArrowErrorCode ArrowMetadataBuilderSetInternal(struct ArrowBuffer* buffer, struct ArrowStringView* key, struct ArrowStringView* value) { // Inspect the current value to see if we can avoid copying the buffer struct ArrowStringView current_value = ArrowCharView(NULL); NANOARROW_RETURN_NOT_OK( ArrowMetadataGetValueInternal((const char*)buffer->data, key, ¤t_value)); // The key should be removed but no key exists if (value == NULL && current_value.data == NULL) { return NANOARROW_OK; } // The key/value can be appended because no key exists if (value != NULL && current_value.data == NULL) { return ArrowMetadataBuilderAppendInternal(buffer, key, value); } struct ArrowMetadataReader reader; struct ArrowStringView existing_key; struct ArrowStringView existing_value; NANOARROW_RETURN_NOT_OK(ArrowMetadataReaderInit(&reader, (const char*)buffer->data)); struct ArrowBuffer new_buffer; NANOARROW_RETURN_NOT_OK(ArrowMetadataBuilderInit(&new_buffer, NULL)); while (reader.remaining_keys > 0) { int result = ArrowMetadataReaderRead(&reader, &existing_key, &existing_value); if (result != NANOARROW_OK) { 
ArrowBufferReset(&new_buffer); return result; } if (key->size_bytes == existing_key.size_bytes && strncmp((const char*)key->data, (const char*)existing_key.data, existing_key.size_bytes) == 0) { result = ArrowMetadataBuilderAppendInternal(&new_buffer, key, value); value = NULL; } else { result = ArrowMetadataBuilderAppendInternal(&new_buffer, &existing_key, &existing_value); } if (result != NANOARROW_OK) { ArrowBufferReset(&new_buffer); return result; } } ArrowBufferReset(buffer); ArrowBufferMove(&new_buffer, buffer); return NANOARROW_OK; } ArrowErrorCode ArrowMetadataBuilderAppend(struct ArrowBuffer* buffer, struct ArrowStringView key, struct ArrowStringView value) { return ArrowMetadataBuilderAppendInternal(buffer, &key, &value); } ArrowErrorCode ArrowMetadataBuilderSet(struct ArrowBuffer* buffer, struct ArrowStringView key, struct ArrowStringView value) { return ArrowMetadataBuilderSetInternal(buffer, &key, &value); } ArrowErrorCode ArrowMetadataBuilderRemove(struct ArrowBuffer* buffer, struct ArrowStringView key) { return ArrowMetadataBuilderSetInternal(buffer, &key, NULL); } // Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. 
#include #include #include #include "nanoarrow.h" static void ArrowArrayReleaseInternal(struct ArrowArray* array) { // Release buffers held by this array struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; if (private_data != NULL) { ArrowBitmapReset(&private_data->bitmap); ArrowBufferReset(&private_data->buffers[0]); ArrowBufferReset(&private_data->buffers[1]); ArrowFree(private_data); } // This object owns the memory for all the children, but those // children may have been generated elsewhere and might have // their own release() callback. if (array->children != NULL) { for (int64_t i = 0; i < array->n_children; i++) { if (array->children[i] != NULL) { if (array->children[i]->release != NULL) { ArrowArrayRelease(array->children[i]); } ArrowFree(array->children[i]); } } ArrowFree(array->children); } // This object owns the memory for the dictionary but it // may have been generated somewhere else and have its own // release() callback. if (array->dictionary != NULL) { if (array->dictionary->release != NULL) { ArrowArrayRelease(array->dictionary); } ArrowFree(array->dictionary); } // Mark released array->release = NULL; } static ArrowErrorCode ArrowArraySetStorageType(struct ArrowArray* array, enum ArrowType storage_type) { switch (storage_type) { case NANOARROW_TYPE_UNINITIALIZED: case NANOARROW_TYPE_NA: array->n_buffers = 0; break; case NANOARROW_TYPE_FIXED_SIZE_LIST: case NANOARROW_TYPE_STRUCT: case NANOARROW_TYPE_SPARSE_UNION: array->n_buffers = 1; break; case NANOARROW_TYPE_LIST: case NANOARROW_TYPE_LARGE_LIST: case NANOARROW_TYPE_MAP: case NANOARROW_TYPE_BOOL: case NANOARROW_TYPE_UINT8: case NANOARROW_TYPE_INT8: case NANOARROW_TYPE_UINT16: case NANOARROW_TYPE_INT16: case NANOARROW_TYPE_UINT32: case NANOARROW_TYPE_INT32: case NANOARROW_TYPE_UINT64: case NANOARROW_TYPE_INT64: case NANOARROW_TYPE_HALF_FLOAT: case NANOARROW_TYPE_FLOAT: case NANOARROW_TYPE_DOUBLE: case NANOARROW_TYPE_DECIMAL128: case 
NANOARROW_TYPE_DECIMAL256: case NANOARROW_TYPE_INTERVAL_MONTHS: case NANOARROW_TYPE_INTERVAL_DAY_TIME: case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: case NANOARROW_TYPE_FIXED_SIZE_BINARY: case NANOARROW_TYPE_DENSE_UNION: array->n_buffers = 2; break; case NANOARROW_TYPE_STRING: case NANOARROW_TYPE_LARGE_STRING: case NANOARROW_TYPE_BINARY: case NANOARROW_TYPE_LARGE_BINARY: array->n_buffers = 3; break; default: return EINVAL; return NANOARROW_OK; } struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; private_data->storage_type = storage_type; return NANOARROW_OK; } ArrowErrorCode ArrowArrayInitFromType(struct ArrowArray* array, enum ArrowType storage_type) { array->length = 0; array->null_count = 0; array->offset = 0; array->n_buffers = 0; array->n_children = 0; array->buffers = NULL; array->children = NULL; array->dictionary = NULL; array->release = &ArrowArrayReleaseInternal; array->private_data = NULL; struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)ArrowMalloc(sizeof(struct ArrowArrayPrivateData)); if (private_data == NULL) { array->release = NULL; return ENOMEM; } ArrowBitmapInit(&private_data->bitmap); ArrowBufferInit(&private_data->buffers[0]); ArrowBufferInit(&private_data->buffers[1]); private_data->buffer_data[0] = NULL; private_data->buffer_data[1] = NULL; private_data->buffer_data[2] = NULL; array->private_data = private_data; array->buffers = (const void**)(&private_data->buffer_data); int result = ArrowArraySetStorageType(array, storage_type); if (result != NANOARROW_OK) { ArrowArrayRelease(array); return result; } ArrowLayoutInit(&private_data->layout, storage_type); // We can only know this not to be true when initializing based on a schema // so assume this to be true. 
private_data->union_type_id_is_child_index = 1; return NANOARROW_OK; } ArrowErrorCode ArrowArrayInitFromArrayView(struct ArrowArray* array, const struct ArrowArrayView* array_view, struct ArrowError* error) { NANOARROW_RETURN_NOT_OK_WITH_ERROR( ArrowArrayInitFromType(array, array_view->storage_type), error); int result; struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; private_data->layout = array_view->layout; if (array_view->n_children > 0) { result = ArrowArrayAllocateChildren(array, array_view->n_children); if (result != NANOARROW_OK) { ArrowArrayRelease(array); return result; } for (int64_t i = 0; i < array_view->n_children; i++) { result = ArrowArrayInitFromArrayView(array->children[i], array_view->children[i], error); if (result != NANOARROW_OK) { ArrowArrayRelease(array); return result; } } } if (array_view->dictionary != NULL) { result = ArrowArrayAllocateDictionary(array); if (result != NANOARROW_OK) { ArrowArrayRelease(array); return result; } result = ArrowArrayInitFromArrayView(array->dictionary, array_view->dictionary, error); if (result != NANOARROW_OK) { ArrowArrayRelease(array); return result; } } return NANOARROW_OK; } ArrowErrorCode ArrowArrayInitFromSchema(struct ArrowArray* array, const struct ArrowSchema* schema, struct ArrowError* error) { struct ArrowArrayView array_view; NANOARROW_RETURN_NOT_OK(ArrowArrayViewInitFromSchema(&array_view, schema, error)); NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromArrayView(array, &array_view, error)); if (array_view.storage_type == NANOARROW_TYPE_DENSE_UNION || array_view.storage_type == NANOARROW_TYPE_SPARSE_UNION) { struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; // We can still build arrays if this isn't true; however, the append // functions won't work. Instead, we store this value and error only // when StartAppending is called. 
private_data->union_type_id_is_child_index = _ArrowUnionTypeIdsWillEqualChildIndices(schema->format + 4, schema->n_children); } ArrowArrayViewReset(&array_view); return NANOARROW_OK; } ArrowErrorCode ArrowArrayAllocateChildren(struct ArrowArray* array, int64_t n_children) { if (array->children != NULL) { return EINVAL; } if (n_children == 0) { return NANOARROW_OK; } array->children = (struct ArrowArray**)ArrowMalloc(n_children * sizeof(struct ArrowArray*)); if (array->children == NULL) { return ENOMEM; } memset(array->children, 0, n_children * sizeof(struct ArrowArray*)); for (int64_t i = 0; i < n_children; i++) { array->children[i] = (struct ArrowArray*)ArrowMalloc(sizeof(struct ArrowArray)); if (array->children[i] == NULL) { return ENOMEM; } array->children[i]->release = NULL; } array->n_children = n_children; return NANOARROW_OK; } ArrowErrorCode ArrowArrayAllocateDictionary(struct ArrowArray* array) { if (array->dictionary != NULL) { return EINVAL; } array->dictionary = (struct ArrowArray*)ArrowMalloc(sizeof(struct ArrowArray)); if (array->dictionary == NULL) { return ENOMEM; } array->dictionary->release = NULL; return NANOARROW_OK; } void ArrowArraySetValidityBitmap(struct ArrowArray* array, struct ArrowBitmap* bitmap) { struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; ArrowBufferMove(&bitmap->buffer, &private_data->bitmap.buffer); private_data->bitmap.size_bits = bitmap->size_bits; bitmap->size_bits = 0; private_data->buffer_data[0] = private_data->bitmap.buffer.data; array->null_count = -1; } ArrowErrorCode ArrowArraySetBuffer(struct ArrowArray* array, int64_t i, struct ArrowBuffer* buffer) { struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; switch (i) { case 0: ArrowBufferMove(buffer, &private_data->bitmap.buffer); private_data->buffer_data[i] = private_data->bitmap.buffer.data; break; case 1: case 2: ArrowBufferMove(buffer, &private_data->buffers[i - 1]); 
private_data->buffer_data[i] = private_data->buffers[i - 1].data; break; default: return EINVAL; } return NANOARROW_OK; } static ArrowErrorCode ArrowArrayViewInitFromArray(struct ArrowArrayView* array_view, struct ArrowArray* array) { struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; ArrowArrayViewInitFromType(array_view, private_data->storage_type); array_view->layout = private_data->layout; array_view->array = array; array_view->length = array->length; array_view->offset = array->offset; array_view->null_count = array->null_count; array_view->buffer_views[0].data.as_uint8 = private_data->bitmap.buffer.data; array_view->buffer_views[0].size_bytes = private_data->bitmap.buffer.size_bytes; array_view->buffer_views[1].data.as_uint8 = private_data->buffers[0].data; array_view->buffer_views[1].size_bytes = private_data->buffers[0].size_bytes; array_view->buffer_views[2].data.as_uint8 = private_data->buffers[1].data; array_view->buffer_views[2].size_bytes = private_data->buffers[1].size_bytes; int result = ArrowArrayViewAllocateChildren(array_view, array->n_children); if (result != NANOARROW_OK) { ArrowArrayViewReset(array_view); return result; } for (int64_t i = 0; i < array->n_children; i++) { result = ArrowArrayViewInitFromArray(array_view->children[i], array->children[i]); if (result != NANOARROW_OK) { ArrowArrayViewReset(array_view); return result; } } if (array->dictionary != NULL) { result = ArrowArrayViewAllocateDictionary(array_view); if (result != NANOARROW_OK) { ArrowArrayViewReset(array_view); return result; } result = ArrowArrayViewInitFromArray(array_view->dictionary, array->dictionary); if (result != NANOARROW_OK) { ArrowArrayViewReset(array_view); return result; } } return NANOARROW_OK; } static ArrowErrorCode ArrowArrayReserveInternal(struct ArrowArray* array, struct ArrowArrayView* array_view) { // Loop through buffers and reserve the extra space that we know about for (int64_t i = 0; i < array->n_buffers; 
i++) { // Don't reserve on a validity buffer that hasn't been allocated yet if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_VALIDITY && ArrowArrayBuffer(array, i)->data == NULL) { continue; } int64_t additional_size_bytes = array_view->buffer_views[i].size_bytes - ArrowArrayBuffer(array, i)->size_bytes; if (additional_size_bytes > 0) { NANOARROW_RETURN_NOT_OK( ArrowBufferReserve(ArrowArrayBuffer(array, i), additional_size_bytes)); } } // Recursively reserve children for (int64_t i = 0; i < array->n_children; i++) { NANOARROW_RETURN_NOT_OK( ArrowArrayReserveInternal(array->children[i], array_view->children[i])); } return NANOARROW_OK; } ArrowErrorCode ArrowArrayReserve(struct ArrowArray* array, int64_t additional_size_elements) { struct ArrowArrayView array_view; NANOARROW_RETURN_NOT_OK(ArrowArrayViewInitFromArray(&array_view, array)); // Calculate theoretical buffer sizes (recursively) ArrowArrayViewSetLength(&array_view, array->length + additional_size_elements); // Walk the structure (recursively) int result = ArrowArrayReserveInternal(array, &array_view); ArrowArrayViewReset(&array_view); if (result != NANOARROW_OK) { return result; } return NANOARROW_OK; } static ArrowErrorCode ArrowArrayFinalizeBuffers(struct ArrowArray* array) { struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; // The only buffer finalizing this currently does is make sure the data // buffer for (Large)String|Binary is never NULL switch (private_data->storage_type) { case NANOARROW_TYPE_BINARY: case NANOARROW_TYPE_STRING: case NANOARROW_TYPE_LARGE_BINARY: case NANOARROW_TYPE_LARGE_STRING: if (ArrowArrayBuffer(array, 2)->data == NULL) { NANOARROW_RETURN_NOT_OK(ArrowBufferAppendUInt8(ArrowArrayBuffer(array, 2), 0)); } break; default: break; } for (int64_t i = 0; i < array->n_children; i++) { NANOARROW_RETURN_NOT_OK(ArrowArrayFinalizeBuffers(array->children[i])); } if (array->dictionary != NULL) { 
NANOARROW_RETURN_NOT_OK(ArrowArrayFinalizeBuffers(array->dictionary)); } return NANOARROW_OK; } static void ArrowArrayFlushInternalPointers(struct ArrowArray* array) { struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; for (int64_t i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { private_data->buffer_data[i] = ArrowArrayBuffer(array, i)->data; } for (int64_t i = 0; i < array->n_children; i++) { ArrowArrayFlushInternalPointers(array->children[i]); } if (array->dictionary != NULL) { ArrowArrayFlushInternalPointers(array->dictionary); } } ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray* array, enum ArrowValidationLevel validation_level, struct ArrowError* error) { // Even if the data buffer is size zero, the pointer value needed to be non-null // in some implementations (at least one version of Arrow C++ at the time this // was added). Only do this fix if we can assume CPU data access. if (validation_level >= NANOARROW_VALIDATION_LEVEL_DEFAULT) { NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowArrayFinalizeBuffers(array), error); } // Make sure the value we get with array->buffers[i] is set to the actual // pointer (which may have changed from the original due to reallocation) ArrowArrayFlushInternalPointers(array); if (validation_level == NANOARROW_VALIDATION_LEVEL_NONE) { return NANOARROW_OK; } // For validation, initialize an ArrowArrayView with our known buffer sizes struct ArrowArrayView array_view; NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowArrayViewInitFromArray(&array_view, array), error); int result = ArrowArrayViewValidate(&array_view, validation_level, error); ArrowArrayViewReset(&array_view); return result; } ArrowErrorCode ArrowArrayFinishBuildingDefault(struct ArrowArray* array, struct ArrowError* error) { return ArrowArrayFinishBuilding(array, NANOARROW_VALIDATION_LEVEL_DEFAULT, error); } void ArrowArrayViewInitFromType(struct ArrowArrayView* array_view, enum ArrowType storage_type) { memset(array_view, 0, 
sizeof(struct ArrowArrayView)); array_view->storage_type = storage_type; ArrowLayoutInit(&array_view->layout, storage_type); } ArrowErrorCode ArrowArrayViewAllocateChildren(struct ArrowArrayView* array_view, int64_t n_children) { if (array_view->children != NULL) { return EINVAL; } array_view->children = (struct ArrowArrayView**)ArrowMalloc(n_children * sizeof(struct ArrowArrayView*)); if (array_view->children == NULL) { return ENOMEM; } for (int64_t i = 0; i < n_children; i++) { array_view->children[i] = NULL; } array_view->n_children = n_children; for (int64_t i = 0; i < n_children; i++) { array_view->children[i] = (struct ArrowArrayView*)ArrowMalloc(sizeof(struct ArrowArrayView)); if (array_view->children[i] == NULL) { return ENOMEM; } ArrowArrayViewInitFromType(array_view->children[i], NANOARROW_TYPE_UNINITIALIZED); } return NANOARROW_OK; } ArrowErrorCode ArrowArrayViewAllocateDictionary(struct ArrowArrayView* array_view) { if (array_view->dictionary != NULL) { return EINVAL; } array_view->dictionary = (struct ArrowArrayView*)ArrowMalloc(sizeof(struct ArrowArrayView)); if (array_view->dictionary == NULL) { return ENOMEM; } ArrowArrayViewInitFromType(array_view->dictionary, NANOARROW_TYPE_UNINITIALIZED); return NANOARROW_OK; } ArrowErrorCode ArrowArrayViewInitFromSchema(struct ArrowArrayView* array_view, const struct ArrowSchema* schema, struct ArrowError* error) { struct ArrowSchemaView schema_view; int result = ArrowSchemaViewInit(&schema_view, schema, error); if (result != NANOARROW_OK) { return result; } ArrowArrayViewInitFromType(array_view, schema_view.storage_type); array_view->layout = schema_view.layout; result = ArrowArrayViewAllocateChildren(array_view, schema->n_children); if (result != NANOARROW_OK) { ArrowErrorSet(error, "ArrowArrayViewAllocateChildren() failed"); ArrowArrayViewReset(array_view); return result; } for (int64_t i = 0; i < schema->n_children; i++) { result = ArrowArrayViewInitFromSchema(array_view->children[i], schema->children[i], 
error); if (result != NANOARROW_OK) { ArrowArrayViewReset(array_view); return result; } } if (schema->dictionary != NULL) { result = ArrowArrayViewAllocateDictionary(array_view); if (result != NANOARROW_OK) { ArrowArrayViewReset(array_view); return result; } result = ArrowArrayViewInitFromSchema(array_view->dictionary, schema->dictionary, error); if (result != NANOARROW_OK) { ArrowArrayViewReset(array_view); return result; } } if (array_view->storage_type == NANOARROW_TYPE_SPARSE_UNION || array_view->storage_type == NANOARROW_TYPE_DENSE_UNION) { array_view->union_type_id_map = (int8_t*)ArrowMalloc(256 * sizeof(int8_t)); if (array_view->union_type_id_map == NULL) { return ENOMEM; } memset(array_view->union_type_id_map, -1, 256); int32_t n_type_ids = _ArrowParseUnionTypeIds(schema_view.union_type_ids, array_view->union_type_id_map + 128); for (int8_t child_index = 0; child_index < n_type_ids; child_index++) { int8_t type_id = array_view->union_type_id_map[128 + child_index]; array_view->union_type_id_map[type_id] = child_index; } } return NANOARROW_OK; } void ArrowArrayViewReset(struct ArrowArrayView* array_view) { if (array_view->children != NULL) { for (int64_t i = 0; i < array_view->n_children; i++) { if (array_view->children[i] != NULL) { ArrowArrayViewReset(array_view->children[i]); ArrowFree(array_view->children[i]); } } ArrowFree(array_view->children); } if (array_view->dictionary != NULL) { ArrowArrayViewReset(array_view->dictionary); ArrowFree(array_view->dictionary); } if (array_view->union_type_id_map != NULL) { ArrowFree(array_view->union_type_id_map); } ArrowArrayViewInitFromType(array_view, NANOARROW_TYPE_UNINITIALIZED); } void ArrowArrayViewSetLength(struct ArrowArrayView* array_view, int64_t length) { for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { int64_t element_size_bytes = array_view->layout.element_size_bits[i] / 8; switch (array_view->layout.buffer_type[i]) { case NANOARROW_BUFFER_TYPE_VALIDITY: array_view->buffer_views[i].size_bytes = 
_ArrowBytesForBits(length); continue; case NANOARROW_BUFFER_TYPE_DATA_OFFSET: // Probably don't want/need to rely on the producer to have allocated an // offsets buffer of length 1 for a zero-size array array_view->buffer_views[i].size_bytes = (length != 0) * element_size_bytes * (length + 1); continue; case NANOARROW_BUFFER_TYPE_DATA: array_view->buffer_views[i].size_bytes = _ArrowRoundUpToMultipleOf8(array_view->layout.element_size_bits[i] * length) / 8; continue; case NANOARROW_BUFFER_TYPE_TYPE_ID: case NANOARROW_BUFFER_TYPE_UNION_OFFSET: array_view->buffer_views[i].size_bytes = element_size_bytes * length; continue; case NANOARROW_BUFFER_TYPE_NONE: array_view->buffer_views[i].size_bytes = 0; continue; } } switch (array_view->storage_type) { case NANOARROW_TYPE_STRUCT: case NANOARROW_TYPE_SPARSE_UNION: for (int64_t i = 0; i < array_view->n_children; i++) { ArrowArrayViewSetLength(array_view->children[i], length); } break; case NANOARROW_TYPE_FIXED_SIZE_LIST: if (array_view->n_children >= 1) { ArrowArrayViewSetLength(array_view->children[0], length * array_view->layout.child_size_elements); } default: break; } } // This version recursively extracts information from the array and stores it // in the array view, performing any checks that require the original array. static int ArrowArrayViewSetArrayInternal(struct ArrowArrayView* array_view, const struct ArrowArray* array, struct ArrowError* error) { array_view->array = array; array_view->offset = array->offset; array_view->length = array->length; array_view->null_count = array->null_count; int64_t buffers_required = 0; for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_NONE) { break; } buffers_required++; // Set buffer pointer array_view->buffer_views[i].data.data = array->buffers[i]; // If non-null, set buffer size to unknown. 
if (array->buffers[i] == NULL) { array_view->buffer_views[i].size_bytes = 0; } else { array_view->buffer_views[i].size_bytes = -1; } } // Check the number of buffers if (buffers_required != array->n_buffers) { ArrowErrorSet(error, "Expected array with %d buffer(s) but found %d buffer(s)", (int)buffers_required, (int)array->n_buffers); return EINVAL; } // Check number of children if (array_view->n_children != array->n_children) { ArrowErrorSet(error, "Expected %ld children but found %ld children", (long)array_view->n_children, (long)array->n_children); return EINVAL; } // Recurse for children for (int64_t i = 0; i < array_view->n_children; i++) { NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayInternal(array_view->children[i], array->children[i], error)); } // Check dictionary if (array->dictionary == NULL && array_view->dictionary != NULL) { ArrowErrorSet(error, "Expected dictionary but found NULL"); return EINVAL; } if (array->dictionary != NULL && array_view->dictionary == NULL) { ArrowErrorSet(error, "Expected NULL dictionary but found dictionary member"); return EINVAL; } if (array->dictionary != NULL) { NANOARROW_RETURN_NOT_OK( ArrowArrayViewSetArrayInternal(array_view->dictionary, array->dictionary, error)); } return NANOARROW_OK; } static int ArrowArrayViewValidateMinimal(struct ArrowArrayView* array_view, struct ArrowError* error) { if (array_view->length < 0) { ArrowErrorSet(error, "Expected length >= 0 but found length %ld", (long)array_view->length); return EINVAL; } if (array_view->offset < 0) { ArrowErrorSet(error, "Expected offset >= 0 but found offset %ld", (long)array_view->offset); return EINVAL; } // Calculate buffer sizes that do not require buffer access. If marked as // unknown, assign the buffer size; otherwise, validate it. int64_t offset_plus_length = array_view->offset + array_view->length; // Only loop over the first two buffers because the size of the third buffer // is always data dependent for all current Arrow types. 
for (int i = 0; i < 2; i++) { int64_t element_size_bytes = array_view->layout.element_size_bits[i] / 8; // Initialize with a value that will cause an error if accidentally used uninitialized int64_t min_buffer_size_bytes = array_view->buffer_views[i].size_bytes + 1; switch (array_view->layout.buffer_type[i]) { case NANOARROW_BUFFER_TYPE_VALIDITY: if (array_view->null_count == 0 && array_view->buffer_views[i].size_bytes == 0) { continue; } min_buffer_size_bytes = _ArrowBytesForBits(offset_plus_length); break; case NANOARROW_BUFFER_TYPE_DATA_OFFSET: // Probably don't want/need to rely on the producer to have allocated an // offsets buffer of length 1 for a zero-size array min_buffer_size_bytes = (offset_plus_length != 0) * element_size_bytes * (offset_plus_length + 1); break; case NANOARROW_BUFFER_TYPE_DATA: min_buffer_size_bytes = _ArrowRoundUpToMultipleOf8(array_view->layout.element_size_bits[i] * offset_plus_length) / 8; break; case NANOARROW_BUFFER_TYPE_TYPE_ID: case NANOARROW_BUFFER_TYPE_UNION_OFFSET: min_buffer_size_bytes = element_size_bytes * offset_plus_length; break; case NANOARROW_BUFFER_TYPE_NONE: continue; } // Assign or validate buffer size if (array_view->buffer_views[i].size_bytes == -1) { array_view->buffer_views[i].size_bytes = min_buffer_size_bytes; } else if (array_view->buffer_views[i].size_bytes < min_buffer_size_bytes) { ArrowErrorSet(error, "Expected %s array buffer %d to have size >= %ld bytes but found " "buffer with %ld bytes", ArrowTypeString(array_view->storage_type), (int)i, (long)min_buffer_size_bytes, (long)array_view->buffer_views[i].size_bytes); return EINVAL; } } // For list, fixed-size list and map views, we can validate the number of children switch (array_view->storage_type) { case NANOARROW_TYPE_LIST: case NANOARROW_TYPE_LARGE_LIST: case NANOARROW_TYPE_FIXED_SIZE_LIST: case NANOARROW_TYPE_MAP: if (array_view->n_children != 1) { ArrowErrorSet(error, "Expected 1 child of %s array but found %ld child arrays", 
ArrowTypeString(array_view->storage_type), (long)array_view->n_children); return EINVAL; } default: break; } // For struct, the sparse union, and the fixed-size list views, we can validate child // lengths. int64_t child_min_length; switch (array_view->storage_type) { case NANOARROW_TYPE_SPARSE_UNION: case NANOARROW_TYPE_STRUCT: child_min_length = (array_view->offset + array_view->length); for (int64_t i = 0; i < array_view->n_children; i++) { if (array_view->children[i]->length < child_min_length) { ArrowErrorSet( error, "Expected struct child %d to have length >= %ld but found child with " "length %ld", (int)(i + 1), (long)(child_min_length), (long)array_view->children[i]->length); return EINVAL; } } break; case NANOARROW_TYPE_FIXED_SIZE_LIST: child_min_length = (array_view->offset + array_view->length) * array_view->layout.child_size_elements; if (array_view->children[0]->length < child_min_length) { ArrowErrorSet(error, "Expected child of fixed_size_list array to have length >= %ld but " "found array with length %ld", (long)child_min_length, (long)array_view->children[0]->length); return EINVAL; } break; default: break; } // Recurse for children for (int64_t i = 0; i < array_view->n_children; i++) { NANOARROW_RETURN_NOT_OK( ArrowArrayViewValidateMinimal(array_view->children[i], error)); } // Recurse for dictionary if (array_view->dictionary != NULL) { NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateMinimal(array_view->dictionary, error)); } return NANOARROW_OK; } static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view, struct ArrowError* error) { // Perform minimal validation. This will validate or assign // buffer sizes as long as buffer access is not required. NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateMinimal(array_view, error)); // Calculate buffer sizes or child lengths that require accessing the offsets // buffer. Where appropriate, validate that the first offset is >= 0. 
// If a buffer size is marked as unknown, assign it; otherwise, validate it. int64_t offset_plus_length = array_view->offset + array_view->length; int64_t first_offset; int64_t last_offset; switch (array_view->storage_type) { case NANOARROW_TYPE_STRING: case NANOARROW_TYPE_BINARY: if (array_view->buffer_views[1].size_bytes != 0) { first_offset = array_view->buffer_views[1].data.as_int32[0]; if (first_offset < 0) { ArrowErrorSet(error, "Expected first offset >= 0 but found %ld", (long)first_offset); return EINVAL; } last_offset = array_view->buffer_views[1].data.as_int32[offset_plus_length]; // If the data buffer size is unknown, assign it; otherwise, check it if (array_view->buffer_views[2].size_bytes == -1) { array_view->buffer_views[2].size_bytes = last_offset; } else if (array_view->buffer_views[2].size_bytes < last_offset) { ArrowErrorSet(error, "Expected %s array buffer 2 to have size >= %ld bytes but found " "buffer with %ld bytes", ArrowTypeString(array_view->storage_type), (long)last_offset, (long)array_view->buffer_views[2].size_bytes); return EINVAL; } } break; case NANOARROW_TYPE_LARGE_STRING: case NANOARROW_TYPE_LARGE_BINARY: if (array_view->buffer_views[1].size_bytes != 0) { first_offset = array_view->buffer_views[1].data.as_int64[0]; if (first_offset < 0) { ArrowErrorSet(error, "Expected first offset >= 0 but found %ld", (long)first_offset); return EINVAL; } last_offset = array_view->buffer_views[1].data.as_int64[offset_plus_length]; // If the data buffer size is unknown, assign it; otherwise, check it if (array_view->buffer_views[2].size_bytes == -1) { array_view->buffer_views[2].size_bytes = last_offset; } else if (array_view->buffer_views[2].size_bytes < last_offset) { ArrowErrorSet(error, "Expected %s array buffer 2 to have size >= %ld bytes but found " "buffer with %ld bytes", ArrowTypeString(array_view->storage_type), (long)last_offset, (long)array_view->buffer_views[2].size_bytes); return EINVAL; } } break; case NANOARROW_TYPE_STRUCT: for 
(int64_t i = 0; i < array_view->n_children; i++) { if (array_view->children[i]->length < offset_plus_length) { ArrowErrorSet( error, "Expected struct child %d to have length >= %ld but found child with " "length %ld", (int)(i + 1), (long)offset_plus_length, (long)array_view->children[i]->length); return EINVAL; } } break; case NANOARROW_TYPE_LIST: case NANOARROW_TYPE_MAP: if (array_view->buffer_views[1].size_bytes != 0) { first_offset = array_view->buffer_views[1].data.as_int32[0]; if (first_offset < 0) { ArrowErrorSet(error, "Expected first offset >= 0 but found %ld", (long)first_offset); return EINVAL; } last_offset = array_view->buffer_views[1].data.as_int32[offset_plus_length]; if (array_view->children[0]->length < last_offset) { ArrowErrorSet( error, "Expected child of %s array to have length >= %ld but found array with " "length %ld", ArrowTypeString(array_view->storage_type), (long)last_offset, (long)array_view->children[0]->length); return EINVAL; } } break; case NANOARROW_TYPE_LARGE_LIST: if (array_view->buffer_views[1].size_bytes != 0) { first_offset = array_view->buffer_views[1].data.as_int64[0]; if (first_offset < 0) { ArrowErrorSet(error, "Expected first offset >= 0 but found %ld", (long)first_offset); return EINVAL; } last_offset = array_view->buffer_views[1].data.as_int64[offset_plus_length]; if (array_view->children[0]->length < last_offset) { ArrowErrorSet( error, "Expected child of large list array to have length >= %ld but found array " "with length %ld", (long)last_offset, (long)array_view->children[0]->length); return EINVAL; } } break; default: break; } // Recurse for children for (int64_t i = 0; i < array_view->n_children; i++) { NANOARROW_RETURN_NOT_OK( ArrowArrayViewValidateDefault(array_view->children[i], error)); } // Recurse for dictionary if (array_view->dictionary != NULL) { NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateDefault(array_view->dictionary, error)); } return NANOARROW_OK; } ArrowErrorCode ArrowArrayViewSetArray(struct 
ArrowArrayView* array_view, const struct ArrowArray* array, struct ArrowError* error) { // Extract information from the array into the array view NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayInternal(array_view, array, error)); // Run default validation. Because we've marked all non-NULL buffers as having unknown // size, validation will also update the buffer sizes as it goes. NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateDefault(array_view, error)); return NANOARROW_OK; } ArrowErrorCode ArrowArrayViewSetArrayMinimal(struct ArrowArrayView* array_view, const struct ArrowArray* array, struct ArrowError* error) { // Extract information from the array into the array view NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayInternal(array_view, array, error)); // Run default validation. Because we've marked all non-NULL buffers as having unknown // size, validation will also update the buffer sizes as it goes. NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateMinimal(array_view, error)); return NANOARROW_OK; } static int ArrowAssertIncreasingInt32(struct ArrowBufferView view, struct ArrowError* error) { if (view.size_bytes <= (int64_t)sizeof(int32_t)) { return NANOARROW_OK; } for (int64_t i = 1; i < view.size_bytes / (int64_t)sizeof(int32_t); i++) { if (view.data.as_int32[i] < view.data.as_int32[i - 1]) { ArrowErrorSet(error, "[%ld] Expected element size >= 0", (long)i); return EINVAL; } } return NANOARROW_OK; } static int ArrowAssertIncreasingInt64(struct ArrowBufferView view, struct ArrowError* error) { if (view.size_bytes <= (int64_t)sizeof(int64_t)) { return NANOARROW_OK; } for (int64_t i = 1; i < view.size_bytes / (int64_t)sizeof(int64_t); i++) { if (view.data.as_int64[i] < view.data.as_int64[i - 1]) { ArrowErrorSet(error, "[%ld] Expected element size >= 0", (long)i); return EINVAL; } } return NANOARROW_OK; } static int ArrowAssertRangeInt8(struct ArrowBufferView view, int8_t min_value, int8_t max_value, struct ArrowError* error) { for (int64_t i = 0; i < view.size_bytes; 
i++) { if (view.data.as_int8[i] < min_value || view.data.as_int8[i] > max_value) { ArrowErrorSet(error, "[%ld] Expected buffer value between %d and %d but found value %d", (long)i, (int)min_value, (int)max_value, (int)view.data.as_int8[i]); return EINVAL; } } return NANOARROW_OK; } static int ArrowAssertInt8In(struct ArrowBufferView view, const int8_t* values, int64_t n_values, struct ArrowError* error) { for (int64_t i = 0; i < view.size_bytes; i++) { int item_found = 0; for (int64_t j = 0; j < n_values; j++) { if (view.data.as_int8[i] == values[j]) { item_found = 1; break; } } if (!item_found) { ArrowErrorSet(error, "[%ld] Unexpected buffer value %d", (long)i, (int)view.data.as_int8[i]); return EINVAL; } } return NANOARROW_OK; } static int ArrowArrayViewValidateFull(struct ArrowArrayView* array_view, struct ArrowError* error) { for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { switch (array_view->layout.buffer_type[i]) { case NANOARROW_BUFFER_TYPE_DATA_OFFSET: if (array_view->layout.element_size_bits[i] == 32) { NANOARROW_RETURN_NOT_OK( ArrowAssertIncreasingInt32(array_view->buffer_views[i], error)); } else { NANOARROW_RETURN_NOT_OK( ArrowAssertIncreasingInt64(array_view->buffer_views[i], error)); } break; default: break; } } if (array_view->storage_type == NANOARROW_TYPE_DENSE_UNION || array_view->storage_type == NANOARROW_TYPE_SPARSE_UNION) { if (array_view->union_type_id_map == NULL) { // If the union_type_id map is NULL (e.g., when using ArrowArrayInitFromType() + // ArrowArrayAllocateChildren() + ArrowArrayFinishBuilding()), we don't have enough // information to validate this buffer. 
ArrowErrorSet(error, "Insufficient information provided for validation of union array"); return EINVAL; } else if (_ArrowParsedUnionTypeIdsWillEqualChildIndices( array_view->union_type_id_map, array_view->n_children, array_view->n_children)) { NANOARROW_RETURN_NOT_OK(ArrowAssertRangeInt8( array_view->buffer_views[0], 0, (int8_t)(array_view->n_children - 1), error)); } else { NANOARROW_RETURN_NOT_OK(ArrowAssertInt8In(array_view->buffer_views[0], array_view->union_type_id_map + 128, array_view->n_children, error)); } } if (array_view->storage_type == NANOARROW_TYPE_DENSE_UNION && array_view->union_type_id_map != NULL) { // Check that offsets refer to child elements that actually exist for (int64_t i = 0; i < array_view->length; i++) { int8_t child_id = ArrowArrayViewUnionChildIndex(array_view, i); int64_t offset = ArrowArrayViewUnionChildOffset(array_view, i); int64_t child_length = array_view->children[child_id]->length; if (offset < 0 || offset > child_length) { ArrowErrorSet( error, "[%ld] Expected union offset for child id %d to be between 0 and %ld but " "found offset value %ld", (long)i, (int)child_id, (long)child_length, (long)offset); return EINVAL; } } } // Recurse for children for (int64_t i = 0; i < array_view->n_children; i++) { NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateFull(array_view->children[i], error)); } // Dictionary valiation not implemented if (array_view->dictionary != NULL) { NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateFull(array_view->dictionary, error)); // TODO: validate the indices } return NANOARROW_OK; } ArrowErrorCode ArrowArrayViewValidate(struct ArrowArrayView* array_view, enum ArrowValidationLevel validation_level, struct ArrowError* error) { switch (validation_level) { case NANOARROW_VALIDATION_LEVEL_NONE: return NANOARROW_OK; case NANOARROW_VALIDATION_LEVEL_MINIMAL: return ArrowArrayViewValidateMinimal(array_view, error); case NANOARROW_VALIDATION_LEVEL_DEFAULT: return ArrowArrayViewValidateDefault(array_view, error); case 
NANOARROW_VALIDATION_LEVEL_FULL: NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateDefault(array_view, error)); return ArrowArrayViewValidateFull(array_view, error); } ArrowErrorSet(error, "validation_level not recognized"); return EINVAL; } // Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. 
#include #include "nanoarrow.h" struct BasicArrayStreamPrivate { struct ArrowSchema schema; int64_t n_arrays; struct ArrowArray* arrays; int64_t arrays_i; }; static int ArrowBasicArrayStreamGetSchema(struct ArrowArrayStream* array_stream, struct ArrowSchema* schema) { if (array_stream == NULL || array_stream->release == NULL) { return EINVAL; } struct BasicArrayStreamPrivate* private_data = (struct BasicArrayStreamPrivate*)array_stream->private_data; return ArrowSchemaDeepCopy(&private_data->schema, schema); } static int ArrowBasicArrayStreamGetNext(struct ArrowArrayStream* array_stream, struct ArrowArray* array) { if (array_stream == NULL || array_stream->release == NULL) { return EINVAL; } struct BasicArrayStreamPrivate* private_data = (struct BasicArrayStreamPrivate*)array_stream->private_data; if (private_data->arrays_i == private_data->n_arrays) { array->release = NULL; return NANOARROW_OK; } ArrowArrayMove(&private_data->arrays[private_data->arrays_i++], array); return NANOARROW_OK; } static const char* ArrowBasicArrayStreamGetLastError( struct ArrowArrayStream* array_stream) { NANOARROW_UNUSED(array_stream); return NULL; } static void ArrowBasicArrayStreamRelease(struct ArrowArrayStream* array_stream) { if (array_stream == NULL || array_stream->release == NULL) { return; } struct BasicArrayStreamPrivate* private_data = (struct BasicArrayStreamPrivate*)array_stream->private_data; if (private_data->schema.release != NULL) { ArrowSchemaRelease(&private_data->schema); } for (int64_t i = 0; i < private_data->n_arrays; i++) { if (private_data->arrays[i].release != NULL) { ArrowArrayRelease(&private_data->arrays[i]); } } if (private_data->arrays != NULL) { ArrowFree(private_data->arrays); } ArrowFree(private_data); array_stream->release = NULL; } ArrowErrorCode ArrowBasicArrayStreamInit(struct ArrowArrayStream* array_stream, struct ArrowSchema* schema, int64_t n_arrays) { struct BasicArrayStreamPrivate* private_data = (struct BasicArrayStreamPrivate*)ArrowMalloc( 
sizeof(struct BasicArrayStreamPrivate)); if (private_data == NULL) { return ENOMEM; } ArrowSchemaMove(schema, &private_data->schema); private_data->n_arrays = n_arrays; private_data->arrays = NULL; private_data->arrays_i = 0; if (n_arrays > 0) { private_data->arrays = (struct ArrowArray*)ArrowMalloc(n_arrays * sizeof(struct ArrowArray)); if (private_data->arrays == NULL) { ArrowBasicArrayStreamRelease(array_stream); return ENOMEM; } } for (int64_t i = 0; i < private_data->n_arrays; i++) { private_data->arrays[i].release = NULL; } array_stream->get_schema = &ArrowBasicArrayStreamGetSchema; array_stream->get_next = &ArrowBasicArrayStreamGetNext; array_stream->get_last_error = ArrowBasicArrayStreamGetLastError; array_stream->release = ArrowBasicArrayStreamRelease; array_stream->private_data = private_data; return NANOARROW_OK; } void ArrowBasicArrayStreamSetArray(struct ArrowArrayStream* array_stream, int64_t i, struct ArrowArray* array) { struct BasicArrayStreamPrivate* private_data = (struct BasicArrayStreamPrivate*)array_stream->private_data; ArrowArrayMove(array, &private_data->arrays[i]); } ArrowErrorCode ArrowBasicArrayStreamValidate(const struct ArrowArrayStream* array_stream, struct ArrowError* error) { struct BasicArrayStreamPrivate* private_data = (struct BasicArrayStreamPrivate*)array_stream->private_data; struct ArrowArrayView array_view; NANOARROW_RETURN_NOT_OK( ArrowArrayViewInitFromSchema(&array_view, &private_data->schema, error)); for (int64_t i = 0; i < private_data->n_arrays; i++) { if (private_data->arrays[i].release != NULL) { int result = ArrowArrayViewSetArray(&array_view, &private_data->arrays[i], error); if (result != NANOARROW_OK) { ArrowArrayViewReset(&array_view); return result; } } } ArrowArrayViewReset(&array_view); return NANOARROW_OK; } nanoarrow/src/array_view.c0000644000176200001440000000446014547575511015405 0ustar liggesusers// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. 
See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #define R_NO_REMAP #include #include #include "nanoarrow.h" #include "array.h" #include "schema.h" #include "util.h" static void finalize_array_view_xptr(SEXP array_view_xptr) { struct ArrowArrayView* array_view = (struct ArrowArrayView*)R_ExternalPtrAddr(array_view_xptr); if (array_view != NULL) { ArrowArrayViewReset(array_view); ArrowFree(array_view); } } SEXP nanoarrow_c_array_view(SEXP array_xptr, SEXP schema_xptr) { struct ArrowArray* array = nanoarrow_array_from_xptr(array_xptr); struct ArrowSchema* schema = nanoarrow_schema_from_xptr(schema_xptr); struct ArrowError error; ArrowErrorInit(&error); struct ArrowArrayView* array_view = (struct ArrowArrayView*)ArrowMalloc(sizeof(struct ArrowArrayView)); ArrowArrayViewInitFromType(array_view, NANOARROW_TYPE_UNINITIALIZED); SEXP xptr = PROTECT(R_MakeExternalPtr(array_view, R_NilValue, array_xptr)); R_RegisterCFinalizer(xptr, &finalize_array_view_xptr); int result = ArrowArrayViewInitFromSchema(array_view, schema, &error); if (result != NANOARROW_OK) { Rf_error(" %s", error.message); } result = ArrowArrayViewSetArray(array_view, array, &error); if (result != NANOARROW_OK) { Rf_error(" %s", error.message); } Rf_setAttrib(xptr, R_ClassSymbol, nanoarrow_cls_array_view); UNPROTECT(1); return xptr; } SEXP array_view_xptr_from_array_xptr(SEXP 
array_xptr) { return nanoarrow_c_array_view(array_xptr, R_ExternalPtrTag(array_xptr)); } nanoarrow/src/array.h0000644000176200001440000001654614547575511014370 0ustar liggesusers// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #ifndef R_NANOARROW_ARRAY_H_INCLUDED #define R_NANOARROW_ARRAY_H_INCLUDED #include #include #include #include "buffer.h" #include "nanoarrow.h" #include "util.h" // Returns an external pointer to an array child with a schema attached. // The returned pointer will keep its parent alive unless passed through // array_xptr_ensure_independent(). This is typically what you want when // printing or performing a conversion, where the borrowed external pointer // is ephemeral. SEXP borrow_array_child_xptr(SEXP array_xptr, int64_t i); // Returns the underlying struct ArrowArray* from an external pointer, // checking and erroring for invalid objects, pointers, and arrays, but // allowing for R_NilValue to signify a NULL return. static inline struct ArrowArray* nullable_nanoarrow_array_from_xptr(SEXP array_xptr) { if (array_xptr == R_NilValue) { return NULL; } else { return nanoarrow_array_from_xptr(array_xptr); } } // Attaches a schema to an array external pointer. 
// Populates array_copy with an exported view of the array behind array_xptr.
//
// array_xptr: external pointer to the source array.
// array_copy: output; (re)initialized here with ArrowArrayInitFromType() and then
//             filled with borrowed buffers and recursively exported children and
//             dictionary.
//
// Failures are reported with Rf_error() (which does not return); once array_copy
// has been initialized it is released before the error is raised.
static inline void array_export(SEXP array_xptr, struct ArrowArray* array_copy) {
  // If array_xptr has SEXP dependencies (most commonly this would occur if it's
  // a borrowed child of a struct array), this will ensure a version that can be
  // released independently of its parent.
  SEXP independent_array_xptr = PROTECT(array_xptr_ensure_independent(array_xptr));

  struct ArrowArray* array = nanoarrow_array_from_xptr(independent_array_xptr);

  int result = ArrowArrayInitFromType(array_copy, NANOARROW_TYPE_UNINITIALIZED);
  if (result != NANOARROW_OK) {
    Rf_error("ArrowArrayInitFromType() failed");
  }

  // Scalar fields are copied by value
  array_copy->length = array->length;
  array_copy->null_count = array->null_count;
  array_copy->offset = array->offset;

  // Get buffer references, each of which preserve a reference to independent_array_xptr
  array_copy->n_buffers = array->n_buffers;
  for (int64_t i = 0; i < array->n_buffers; i++) {
    // The byte size is passed as 0.
    // NOTE(review): presumably this means "size unknown" (a raw ArrowArray does not
    // record buffer sizes); confirm against buffer_borrowed_xptr().
    SEXP borrowed_buffer =
        PROTECT(buffer_borrowed_xptr(array->buffers[i], 0, independent_array_xptr));
    result = ArrowArraySetBuffer(array_copy, i,
                                 (struct ArrowBuffer*)R_ExternalPtrAddr(borrowed_buffer));
    if (result != NANOARROW_OK) {
      array_copy->release(array_copy);
      Rf_error("ArrowArraySetBuffer() failed");
    }
    UNPROTECT(1);
  }

  // Swap out any children for independently releasable children and export them
  // into array_copy->children
  result = ArrowArrayAllocateChildren(array_copy, array->n_children);
  if (result != NANOARROW_OK) {
    array_copy->release(array_copy);
    Rf_error("ArrowArrayAllocateChildren() failed");
  }

  for (int64_t i = 0; i < array->n_children; i++) {
    // array_ensure_independent() moves the child into a new owning external
    // pointer; the recursive call then exports it into the copy's child slot
    SEXP independent_child = PROTECT(array_ensure_independent(array->children[i]));
    array_export(independent_child, array_copy->children[i]);
    UNPROTECT(1);
  }

  // Same treatment for the dictionary, if there is one
  if (array->dictionary != NULL) {
    result = ArrowArrayAllocateDictionary(array_copy);
    if (result != NANOARROW_OK) {
      array_copy->release(array_copy);
      Rf_error("ArrowArrayAllocateDictionary() failed");
    }

    SEXP independent_dictionary = PROTECT(array_ensure_independent(array->dictionary));
    array_export(independent_dictionary, array_copy->dictionary);
    UNPROTECT(1);
  }

  // Balances the PROTECT() of independent_array_xptr at the top
  UNPROTECT(1);
}
This is fine until we need to keep one // child alive (e.g., a column of a data frame that we attach to an // ALTREP array) or until we need to export it (i.e., comply with // https://arrow.apache.org/docs/format/CDataInterface.html#moving-child-arrays // where child arrays must be movable). To make this work we need to do a shuffle: we // move the child array to a new owning external pointer and // give an exported version back to the original object. This only // applies if the array_xptr has the external pointer 'prot' field // set (if it doesn't have that set, it is already independent). static inline SEXP array_ensure_independent(struct ArrowArray* array) { SEXP original_array_xptr = PROTECT(nanoarrow_array_owning_xptr()); // Move array to the newly created owner struct ArrowArray* original_array = nanoarrow_output_array_from_xptr(original_array_xptr); memcpy(original_array, array, sizeof(struct ArrowArray)); array->release = NULL; // Export the independent array (which keeps a reference to original_array_xptr) // back to the original home array_export(original_array_xptr, array); // Return the external pointer of the independent array UNPROTECT(1); return original_array_xptr; } // This version is like the version that operates on a raw struct ArrowArray* // except it checks if this array has any array dependencies by inspecing the 'Protected' // field of the external pointer: if it that field is R_NilValue, it is already // independent. static inline SEXP array_xptr_ensure_independent(SEXP array_xptr) { struct ArrowArray* array = nanoarrow_array_from_xptr(array_xptr); if (R_ExternalPtrProtected(array_xptr) == R_NilValue) { return array_xptr; } return array_ensure_independent(array); } #endif nanoarrow/src/Makevars0000644000176200001440000000146114547575511014563 0ustar liggesusers# Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. 
See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. PKG_CPPFLAGS=-I../inst/include nanoarrow/src/materialize_unspecified.h0000644000176200001440000000424014502402562020104 0ustar liggesusers// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. 
#ifndef R_MATERIALIZE_UNSPECIFIED_H_INCLUDED #define R_MATERIALIZE_UNSPECIFIED_H_INCLUDED #include #include #include "materialize_common.h" #include "nanoarrow.h" static inline int nanoarrow_materialize_unspecified(struct ArrayViewSlice* src, struct VectorSlice* dst, struct MaterializeOptions* options) { if (src->array_view->array->dictionary != NULL) { return ENOTSUP; } int* result = LOGICAL(dst->vec_sexp); int64_t total_offset = src->array_view->array->offset + src->offset; int64_t length = src->length; const uint8_t* bits = src->array_view->buffer_views[0].data.as_uint8; if (length == 0 || src->array_view->storage_type == NANOARROW_TYPE_NA || ArrowBitCountSet(bits, total_offset, length) == 0) { // We can blindly set all the values to NA_LOGICAL without checking for (int64_t i = 0; i < length; i++) { result[dst->offset + i] = NA_LOGICAL; } } else { // Count non-null values and warn int64_t n_bad_values = 0; for (int64_t i = 0; i < length; i++) { n_bad_values += ArrowBitGet(bits, total_offset + i); result[dst->offset + i] = NA_LOGICAL; } if (n_bad_values > 0) { warn_lossy_conversion(n_bad_values, "that were non-null set to NA"); } } return NANOARROW_OK; } #endif nanoarrow/src/convert_array_stream.c0000644000176200001440000000702414547575511017465 0ustar liggesusers// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. 
See the License for the // specific language governing permissions and limitations // under the License. #define R_NO_REMAP #include #include #include "nanoarrow.h" #include "array.h" #include "array_stream.h" #include "convert.h" #include "schema.h" SEXP nanoarrow_c_convert_array_stream(SEXP array_stream_xptr, SEXP ptype_sexp, SEXP size_sexp, SEXP n_sexp) { struct ArrowArrayStream* array_stream = nanoarrow_array_stream_from_xptr(array_stream_xptr); int64_t size = (int64_t)(REAL(size_sexp)[0]); double n_real = REAL(n_sexp)[0]; int n; if (R_FINITE(n_real)) { n = (int)n_real; } else { n = INT_MAX; } SEXP schema_xptr = PROTECT(nanoarrow_schema_owning_xptr()); struct ArrowSchema* schema = nanoarrow_output_schema_from_xptr(schema_xptr); int result = ArrowArrayStreamGetSchema(array_stream, schema, NULL); if (result != NANOARROW_OK) { Rf_error("ArrowArrayStream::get_schema(): %s", ArrowArrayStreamGetLastError(array_stream)); } SEXP converter_xptr = PROTECT(nanoarrow_converter_from_ptype(ptype_sexp)); if (nanoarrow_converter_set_schema(converter_xptr, schema_xptr) != NANOARROW_OK) { nanoarrow_converter_stop(converter_xptr); } if (nanoarrow_converter_reserve(converter_xptr, size) != NANOARROW_OK) { nanoarrow_converter_stop(converter_xptr); } SEXP array_xptr = PROTECT(nanoarrow_array_owning_xptr()); struct ArrowArray* array = nanoarrow_output_array_from_xptr(array_xptr); int64_t n_batches = 0; int64_t n_materialized = 0; if (n > 0) { result = ArrowArrayStreamGetNext(array_stream, array, NULL); n_batches++; if (result != NANOARROW_OK) { Rf_error("ArrowArrayStream::get_next(): %s", ArrowArrayStreamGetLastError(array_stream)); } while (array->release != NULL) { if (nanoarrow_converter_set_array(converter_xptr, array_xptr) != NANOARROW_OK) { nanoarrow_converter_stop(converter_xptr); } n_materialized = nanoarrow_converter_materialize_n(converter_xptr, array->length); if (n_materialized != array->length) { Rf_error("Expected to materialize %ld values in batch %ld but materialized 
%ld", (long)array->length, (long)n_batches, (long)n_materialized); } if (n_batches >= n) { break; } array->release(array); result = ArrowArrayStreamGetNext(array_stream, array, NULL); n_batches++; if (result != NANOARROW_OK) { Rf_error("ArrowArrayStream::get_next(): %s", ArrowArrayStreamGetLastError(array_stream)); } } } if (nanoarrow_converter_finalize(converter_xptr) != NANOARROW_OK) { nanoarrow_converter_stop(converter_xptr); } SEXP result_sexp = PROTECT(nanoarrow_converter_release_result(converter_xptr)); UNPROTECT(4); return result_sexp; } nanoarrow/src/array.c0000644000176200001440000003632114547575511014354 0ustar liggesusers// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. 
#define R_NO_REMAP #include #include #include #include "array.h" #include "buffer.h" #include "nanoarrow.h" #include "schema.h" #include "util.h" SEXP nanoarrow_c_array_init(SEXP schema_xptr) { struct ArrowSchema* schema = nanoarrow_schema_from_xptr(schema_xptr); SEXP array_xptr = PROTECT(nanoarrow_array_owning_xptr()); struct ArrowArray* array = nanoarrow_output_array_from_xptr(array_xptr); struct ArrowError error; int result = ArrowArrayInitFromSchema(array, schema, &error); if (result != NANOARROW_OK) { Rf_error("ArrowArrayInitFromSchema(): %s", error.message); } array_xptr_set_schema(array_xptr, schema_xptr); UNPROTECT(1); return array_xptr; } SEXP nanoarrow_c_array_set_length(SEXP array_xptr, SEXP length_sexp) { struct ArrowArray* array = nanoarrow_array_from_xptr(array_xptr); if (TYPEOF(length_sexp) != REALSXP || Rf_length(length_sexp) != 1) { Rf_error("array$length must be double(1)"); } double length = REAL(length_sexp)[0]; if (ISNA(length) || ISNAN(length) || length < 0) { Rf_error("array$length must be finite and greater than zero"); } array->length = (int64_t)length; return R_NilValue; } SEXP nanoarrow_c_array_set_null_count(SEXP array_xptr, SEXP null_count_sexp) { struct ArrowArray* array = nanoarrow_array_from_xptr(array_xptr); if (TYPEOF(null_count_sexp) != REALSXP || Rf_length(null_count_sexp) != 1) { Rf_error("array$null_count must be double(1)"); } double null_count = REAL(null_count_sexp)[0]; if (ISNA(null_count) || ISNAN(null_count) || null_count < -1) { Rf_error("array$null_count must be finite and greater than -1"); } array->null_count = (int64_t)null_count; return R_NilValue; } SEXP nanoarrow_c_array_set_offset(SEXP array_xptr, SEXP offset_sexp) { struct ArrowArray* array = nanoarrow_array_from_xptr(array_xptr); if (TYPEOF(offset_sexp) != REALSXP || Rf_length(offset_sexp) != 1) { Rf_error("array$offset must be double(1)"); } double offset = REAL(offset_sexp)[0]; if (ISNA(offset) || ISNAN(offset) || offset < 0) { Rf_error("array$offset must be 
finite and greater than zero"); } array->offset = (int64_t)offset; return R_NilValue; } SEXP nanoarrow_c_array_set_buffers(SEXP array_xptr, SEXP buffers_sexp) { struct ArrowArray* array = nanoarrow_array_from_xptr(array_xptr); int64_t n_buffers = Rf_xlength(buffers_sexp); if (n_buffers > 3) { Rf_error("length(array$buffers) must be <= 3"); } // Release any buffers that aren't about to be replaced for (int64_t i = n_buffers; i < array->n_buffers; i++) { ArrowBufferReset(ArrowArrayBuffer(array, i)); } array->n_buffers = n_buffers; for (int64_t i = 0; i < n_buffers; i++) { SEXP buffer_xptr = VECTOR_ELT(buffers_sexp, i); struct ArrowBuffer* src = buffer_from_xptr(buffer_xptr); // We can't necessarily ArrowBufferMove(src) because that buffer might // have been pointed at by something else. So, we do this slightly awkward // dance to make sure buffer_xptr stays valid after this call. SEXP buffer_xptr_clone = PROTECT(buffer_borrowed_xptr(src->data, src->size_bytes, buffer_xptr)); struct ArrowBuffer* src_clone = (struct ArrowBuffer*)R_ExternalPtrAddr(buffer_xptr_clone); // Release whatever buffer is currently there and replace it with src_clone ArrowBufferReset(ArrowArrayBuffer(array, i)); int result = ArrowArraySetBuffer(array, i, src_clone); if (result != NANOARROW_OK) { Rf_error("ArrowArraySetBuffer() failed"); } UNPROTECT(1); } return R_NilValue; } static void release_all_children(struct ArrowArray* array) { for (int64_t i = 0; i < array->n_children; i++) { if (array->children[i]->release != NULL) { array->children[i]->release(array->children[i]); } } } static void free_all_children(struct ArrowArray* array) { for (int64_t i = 0; i < array->n_children; i++) { if (array->children[i] != NULL) { ArrowFree(array->children[i]); array->children[i] = NULL; } } if (array->children != NULL) { ArrowFree(array->children); array->children = NULL; } array->n_children = 0; } SEXP nanoarrow_c_array_set_children(SEXP array_xptr, SEXP children_sexp) { struct ArrowArray* array = 
nanoarrow_array_from_xptr(array_xptr); release_all_children(array); if (Rf_xlength(children_sexp) == 0) { free_all_children(array); return R_NilValue; } if (Rf_xlength(children_sexp) != array->n_children) { free_all_children(array); int result = ArrowArrayAllocateChildren(array, Rf_xlength(children_sexp)); if (result != NANOARROW_OK) { Rf_error("Error allocating array$children of size %ld", (long)Rf_xlength(children_sexp)); } } for (int64_t i = 0; i < array->n_children; i++) { // The arrays here will be moved, invalidating the arrays in the passed // list (the export step is handled in R) SEXP child_xptr = VECTOR_ELT(children_sexp, i); struct ArrowArray* child = nanoarrow_array_from_xptr(child_xptr); ArrowArrayMove(child, array->children[i]); } return R_NilValue; } SEXP nanoarrow_c_array_set_dictionary(SEXP array_xptr, SEXP dictionary_xptr) { struct ArrowArray* array = nanoarrow_array_from_xptr(array_xptr); // If there's already a dictionary, make sure we release it if (array->dictionary != NULL) { if (array->dictionary->release != NULL) { array->dictionary->release(array->dictionary); } } if (dictionary_xptr == R_NilValue) { if (array->dictionary != NULL) { ArrowFree(array->dictionary); array->dictionary = NULL; } } else { if (array->dictionary == NULL) { int result = ArrowArrayAllocateDictionary(array); if (result != NANOARROW_OK) { Rf_error("Error allocating array$dictionary"); } } struct ArrowArray* dictionary = nanoarrow_array_from_xptr(dictionary_xptr); ArrowArrayMove(dictionary, array->dictionary); } return R_NilValue; } static int move_array_buffers(struct ArrowArray* src, struct ArrowArray* dst, struct ArrowSchema* schema, struct ArrowError* error) { error->message[0] = '\0'; dst->length = src->length; dst->null_count = src->null_count; dst->offset = src->offset; if (src->n_buffers != dst->n_buffers) { ArrowErrorSet(error, "Expected %ld buffer(s) but got %ld", (long)dst->n_buffers, (long)src->n_buffers); return EINVAL; } for (int64_t i = 0; i < 
src->n_buffers; i++) { NANOARROW_RETURN_NOT_OK(ArrowArraySetBuffer(dst, i, ArrowArrayBuffer(src, i))); } if (src->n_children != dst->n_children) { ArrowErrorSet(error, "Expected %ld child(ren) but got %ld", (long)dst->n_children, (long)src->n_children); return EINVAL; } for (int64_t i = 0; i < src->n_children; i++) { NANOARROW_RETURN_NOT_OK(move_array_buffers(src->children[i], dst->children[i], schema->children[i], error)); } if (src->dictionary != NULL) { NANOARROW_RETURN_NOT_OK( move_array_buffers(src->dictionary, dst->dictionary, schema->dictionary, error)); } return NANOARROW_OK; } SEXP nanoarrow_c_array_validate_after_modify(SEXP array_xptr, SEXP schema_xptr) { // A very particular type of validation we can do with the ArrowArray we use // in nanoarrow_array_modify() (which was created using ArrowArrayInit). // At this point we know how long each buffer is (via ArrowArrayBuffer()) // but after we send the array into the wild, that information is lost. // This operation will invalidate array_xptr (but this is OK since we very // specifically just allocated it). struct ArrowArray* array = nanoarrow_array_from_xptr(array_xptr); struct ArrowSchema* schema = nanoarrow_schema_from_xptr(schema_xptr); struct ArrowError error; // Even though array was initialized using ArrowArrayInit(), it doesn't have // all the information about storage types since it didn't necessarily know // what the storage type would be when it was being constructed. Here we create // a version that does and move buffers recursively into it. 
// Attaches schema_xptr to array_xptr (or detaches the schema when schema_xptr
// is R_NilValue), optionally validating that the pair is consistent first.
//
// array_xptr:    external pointer to the array.
// schema_xptr:   external pointer to the schema, or R_NilValue to remove it.
// validate_sexp: logical(1); when true the array is checked against the schema
//                with ArrowArrayViewSetArray() before the schema is attached.
//
// Returns R_NilValue. Raises an R error if validation fails.
SEXP nanoarrow_c_array_set_schema(SEXP array_xptr, SEXP schema_xptr, SEXP validate_sexp) {
  // Fair game to remove a schema from a pointer
  if (schema_xptr == R_NilValue) {
    array_xptr_set_schema(array_xptr, R_NilValue);
    return R_NilValue;
  }

  int validate = LOGICAL(validate_sexp)[0];
  if (validate) {
    // If adding a schema, validate the schema and the pair
    struct ArrowArray* array = nanoarrow_array_from_xptr(array_xptr);
    struct ArrowSchema* schema = nanoarrow_schema_from_xptr(schema_xptr);

    // Give both structs a defined state up front (the same pattern that
    // nanoarrow_c_array_view() uses) so that ArrowArrayViewReset() and
    // ArrowErrorMessage() below never operate on indeterminate stack memory
    // should ArrowArrayViewInitFromSchema() fail early.
    struct ArrowArrayView array_view;
    struct ArrowError error;
    ArrowErrorInit(&error);
    ArrowArrayViewInitFromType(&array_view, NANOARROW_TYPE_UNINITIALIZED);

    int result = ArrowArrayViewInitFromSchema(&array_view, schema, &error);
    if (result != NANOARROW_OK) {
      ArrowArrayViewReset(&array_view);
      Rf_error("%s", ArrowErrorMessage(&error));
    }

    // The view is only needed for validation: reset it before (possibly)
    // raising the error, because Rf_error() does not return
    result = ArrowArrayViewSetArray(&array_view, array, &error);
    ArrowArrayViewReset(&array_view);
    if (result != NANOARROW_OK) {
      Rf_error("%s", ArrowErrorMessage(&error));
    }
  }

  array_xptr_set_schema(array_xptr, schema_xptr);
  return R_NilValue;
}
Rf_setAttrib(array_xptr, R_ClassSymbol, nanoarrow_cls_array); UNPROTECT(1); return array_xptr; } SEXP borrow_array_child_xptr(SEXP array_xptr, int64_t i) { struct ArrowArray* array = nanoarrow_array_from_xptr(array_xptr); SEXP schema_xptr = R_ExternalPtrTag(array_xptr); SEXP child_xptr = PROTECT(borrow_array_xptr(array->children[i], array_xptr)); if (schema_xptr != R_NilValue) { array_xptr_set_schema(child_xptr, borrow_schema_child_xptr(schema_xptr, i)); } UNPROTECT(1); return child_xptr; } static SEXP borrow_array_view_child(struct ArrowArrayView* array_view, int64_t i, SEXP shelter) { if (array_view != NULL) { return R_MakeExternalPtr(array_view->children[i], R_NilValue, shelter); } else { return R_NilValue; } } static SEXP borrow_array_view_dictionary(struct ArrowArrayView* array_view, SEXP shelter) { if (array_view != NULL) { return R_MakeExternalPtr(array_view->dictionary, R_NilValue, shelter); } else { return R_NilValue; } } static SEXP borrow_unknown_buffer(struct ArrowArray* array, int64_t i, SEXP shelter) { return buffer_borrowed_xptr(array->buffers[i], 0, shelter); } static SEXP borrow_buffer(struct ArrowArrayView* array_view, int64_t i, SEXP shelter) { SEXP buffer_class = PROTECT(Rf_allocVector(STRSXP, 2)); SET_STRING_ELT(buffer_class, 1, Rf_mkChar("nanoarrow_buffer")); SEXP buffer_xptr = PROTECT(buffer_borrowed_xptr(array_view->buffer_views[i].data.data, array_view->buffer_views[i].size_bytes, shelter)); buffer_borrowed_xptr_set_type(buffer_xptr, array_view->layout.buffer_type[i], array_view->layout.buffer_data_type[i], array_view->layout.element_size_bits[i]); UNPROTECT(2); return buffer_xptr; } SEXP nanoarrow_c_array_proxy(SEXP array_xptr, SEXP array_view_xptr, SEXP recursive_sexp) { struct ArrowArray* array = nanoarrow_array_from_xptr(array_xptr); int recursive = LOGICAL(recursive_sexp)[0]; struct ArrowArrayView* array_view = NULL; if (array_view_xptr != R_NilValue) { array_view = (struct ArrowArrayView*)R_ExternalPtrAddr(array_view_xptr); } const 
char* names[] = {"length", "null_count", "offset", "buffers", "children", "dictionary", ""}; SEXP array_proxy = PROTECT(Rf_mkNamed(VECSXP, names)); SET_VECTOR_ELT(array_proxy, 0, length_sexp_from_int64(array->length)); SET_VECTOR_ELT(array_proxy, 1, length_sexp_from_int64(array->null_count)); SET_VECTOR_ELT(array_proxy, 2, length_sexp_from_int64(array->offset)); if (array->n_buffers > 0) { SEXP buffers = PROTECT(Rf_allocVector(VECSXP, array->n_buffers)); for (int64_t i = 0; i < array->n_buffers; i++) { if (array_view != NULL) { SET_VECTOR_ELT(buffers, i, borrow_buffer(array_view, i, array_xptr)); } else { SET_VECTOR_ELT(buffers, i, borrow_unknown_buffer(array, i, array_xptr)); } } SET_VECTOR_ELT(array_proxy, 3, buffers); UNPROTECT(1); } if (array->n_children > 0) { SEXP children = PROTECT(Rf_allocVector(VECSXP, array->n_children)); for (int64_t i = 0; i < array->n_children; i++) { SEXP child = PROTECT(borrow_array_xptr(array->children[i], array_xptr)); if (recursive) { SEXP array_view_child = PROTECT(borrow_array_view_child(array_view, i, array_view_xptr)); SET_VECTOR_ELT(children, i, nanoarrow_c_array_proxy(child, array_view_child, recursive_sexp)); UNPROTECT(1); } else { SET_VECTOR_ELT(children, i, child); } UNPROTECT(1); } SET_VECTOR_ELT(array_proxy, 4, children); UNPROTECT(1); } if (array->dictionary != NULL) { SEXP dictionary_xptr = PROTECT(borrow_array_xptr(array->dictionary, array_xptr)); if (recursive) { SEXP dictionary_view_xptr = PROTECT(borrow_array_view_dictionary(array_view, array_view_xptr)); SEXP dictionary_proxy = PROTECT( nanoarrow_c_array_proxy(dictionary_xptr, dictionary_view_xptr, recursive_sexp)); SET_VECTOR_ELT(array_proxy, 5, dictionary_proxy); UNPROTECT(2); } else { SET_VECTOR_ELT(array_proxy, 5, dictionary_xptr); } UNPROTECT(1); } UNPROTECT(1); return array_proxy; } nanoarrow/src/array_stream.h0000644000176200001440000000214514547575511015731 0ustar liggesusers// Licensed to the Apache Software Foundation (ASF) under one // or more 
contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #ifndef R_NANOARROW_ARRAY_STREAM_H_INCLUDED #define R_NANOARROW_ARRAY_STREAM_H_INCLUDED #include #include #include #include "nanoarrow.h" #include "util.h" void array_stream_export(SEXP array_stream_xptr, struct ArrowArrayStream* array_stream_copy); #endif nanoarrow/src/array_view.h0000644000176200001440000000406014502402562015370 0ustar liggesusers// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. 
#ifndef R_NANOARROW_ARRAY_VIEW_H_INCLUDED #define R_NANOARROW_ARRAY_VIEW_H_INCLUDED #include #include #include "nanoarrow.h" // Creates an external pointer to a struct ArrowArrayView, erroring // if the validation inherent in its creation fails (i.e., calling // this will also validate the array). This requires that array_xptr // has a schema attached. The ArrowArrayView is an augmented structure // provided by the nanoarrow C library that makes it easier to access // elements and buffers. This is not currently exposed at the R // level but is used at the C level to make validation and conversion // to R easier to write. SEXP array_view_xptr_from_array_xptr(SEXP array_xptr); // Returns the struct ArrowArrayView underlying an external pointer, // erroring for invalid objects and NULL pointers. static inline struct ArrowArrayView* array_view_from_xptr(SEXP array_view_xptr) { if (!Rf_inherits(array_view_xptr, "nanoarrow_array_view")) { Rf_error("`array_view` argument that is not a nanoarrow_array_view()"); } struct ArrowArrayView* array_view = (struct ArrowArrayView*)R_ExternalPtrAddr(array_view_xptr); if (array_view == NULL) { Rf_error("nanoarrow_array_view() is an external pointer to NULL"); } return array_view; } #endif nanoarrow/src/as_array.c0000644000176200001440000004410014547575511015031 0ustar liggesusers// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. 
You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #define R_NO_REMAP #include #include #include #include "array.h" #include "buffer.h" #include "materialize.h" #include "nanoarrow.h" #include "schema.h" #include "util.h" static void call_as_nanoarrow_array(SEXP x_sexp, struct ArrowArray* array, SEXP schema_xptr, const char* fun_name) { SEXP fun = PROTECT(Rf_install(fun_name)); SEXP call = PROTECT(Rf_lang3(fun, x_sexp, schema_xptr)); SEXP result = PROTECT(Rf_eval(call, nanoarrow_ns_pkg)); // In many cases we can skip the array_export() step (which adds some complexity // and an additional R object to the mix) if (Rf_inherits(result, "nanoarrow_array_dont_export")) { struct ArrowArray* array_result = nanoarrow_array_from_xptr(result); ArrowArrayMove(array_result, array); } else { array_export(result, array); } UNPROTECT(3); } static void as_array_int(SEXP x_sexp, struct ArrowArray* array, SEXP schema_xptr, struct ArrowSchemaView* schema_view, struct ArrowError* error) { // Only consider the default create for now if (schema_view->type != NANOARROW_TYPE_INT32) { call_as_nanoarrow_array(x_sexp, array, schema_xptr, "as_nanoarrow_array_from_c"); return; } // We don't consider altrep for now: we need an array of int32_t, and while we // *could* avoid materializing, there's no point because the source altrep // object almost certainly knows how to do this faster than we do. 
int* x_data = INTEGER(x_sexp); int64_t len = Rf_xlength(x_sexp); int result = ArrowArrayInitFromType(array, NANOARROW_TYPE_INT32); if (result != NANOARROW_OK) { Rf_error("ArrowArrayInitFromType() failed"); } // Borrow the data buffer buffer_borrowed(ArrowArrayBuffer(array, 1), x_data, len * sizeof(int32_t), x_sexp); // Set the array fields array->length = len; array->offset = 0; int64_t null_count = 0; // Look for the first null (will be the last index if there are none) int64_t first_null = -1; for (int64_t i = 0; i < len; i++) { if (x_data[i] == NA_INTEGER) { first_null = i; break; } } // If there are nulls, pack the validity buffer if (first_null != -1) { struct ArrowBitmap bitmap; ArrowBitmapInit(&bitmap); result = ArrowBitmapReserve(&bitmap, len); if (result != NANOARROW_OK) { Rf_error("ArrowBitmapReserve() failed"); } ArrowBitmapAppendUnsafe(&bitmap, 1, first_null); for (int64_t i = first_null; i < len; i++) { uint8_t is_valid = x_data[i] != NA_INTEGER; null_count += !is_valid; ArrowBitmapAppendUnsafe(&bitmap, is_valid, 1); } ArrowArraySetValidityBitmap(array, &bitmap); } array->null_count = null_count; result = ArrowArrayFinishBuildingDefault(array, error); if (result != NANOARROW_OK) { Rf_error("ArrowArrayFinishBuildingDefault(): %s", error->message); } } static void as_array_lgl(SEXP x_sexp, struct ArrowArray* array, SEXP schema_xptr, struct ArrowSchemaView* schema_view, struct ArrowError* error) { // We can zero-copy convert to int32 if (schema_view->type == NANOARROW_TYPE_INT32) { as_array_int(x_sexp, array, schema_xptr, schema_view, error); return; } // Only consider bool for now if (schema_view->type != NANOARROW_TYPE_BOOL) { call_as_nanoarrow_array(x_sexp, array, schema_xptr, "as_nanoarrow_array_from_c"); return; } int* x_data = INTEGER(x_sexp); int64_t len = Rf_xlength(x_sexp); int result = ArrowArrayInitFromType(array, NANOARROW_TYPE_BOOL); if (result != NANOARROW_OK) { Rf_error("ArrowArrayInitFromType() failed"); } struct ArrowBitmap value_bitmap; 
ArrowBitmapInit(&value_bitmap); result = ArrowBitmapReserve(&value_bitmap, len); if (result != NANOARROW_OK) { Rf_error("ArrowBitmapReserve() failed"); } int has_nulls = 0; for (int64_t i = 0; i < len; i++) { if (x_data[i] == NA_INTEGER) { has_nulls = 1; ArrowBitmapAppendUnsafe(&value_bitmap, 0, 1); } else { ArrowBitmapAppendUnsafe(&value_bitmap, x_data[i] != 0, 1); } } result = ArrowArraySetBuffer(array, 1, &value_bitmap.buffer); if (result != NANOARROW_OK) { Rf_error("ArrowArraySetBuffer() failed"); } // Set the array fields array->length = len; array->offset = 0; int64_t null_count = 0; // If there are nulls, pack the validity buffer if (has_nulls) { struct ArrowBitmap bitmap; ArrowBitmapInit(&bitmap); result = ArrowBitmapReserve(&bitmap, len); if (result != NANOARROW_OK) { Rf_error("ArrowBitmapReserve() failed"); } for (int64_t i = 0; i < len; i++) { uint8_t is_valid = x_data[i] != NA_INTEGER; null_count += !is_valid; ArrowBitmapAppendUnsafe(&bitmap, is_valid, 1); } ArrowArraySetValidityBitmap(array, &bitmap); } array->null_count = null_count; result = ArrowArrayFinishBuildingDefault(array, error); if (result != NANOARROW_OK) { Rf_error("ArrowArrayFinishBuildingDefault(): %s", error->message); } } static void as_array_dbl(SEXP x_sexp, struct ArrowArray* array, SEXP schema_xptr, struct ArrowSchemaView* schema_view, struct ArrowError* error) { // Consider double -> na_double() and double -> na_int64()/na_int32() // (mostly so that we can support date/time types with various units) switch (schema_view->type) { case NANOARROW_TYPE_DOUBLE: case NANOARROW_TYPE_INT64: case NANOARROW_TYPE_INT32: break; default: call_as_nanoarrow_array(x_sexp, array, schema_xptr, "as_nanoarrow_array_from_c"); return; } double* x_data = REAL(x_sexp); int64_t len = Rf_xlength(x_sexp); int result = ArrowArrayInitFromType(array, schema_view->type); if (result != NANOARROW_OK) { Rf_error("ArrowArrayInitFromType() failed"); } if (schema_view->type == NANOARROW_TYPE_DOUBLE) { // Just borrow 
the data buffer (zero-copy) buffer_borrowed(ArrowArrayBuffer(array, 1), x_data, len * sizeof(double), x_sexp); } else if (schema_view->type == NANOARROW_TYPE_INT64) { // double -> int64_t struct ArrowBuffer* buffer = ArrowArrayBuffer(array, 1); result = ArrowBufferReserve(buffer, len * sizeof(int64_t)); if (result != NANOARROW_OK) { Rf_error("ArrowBufferReserve() failed"); } int64_t* buffer_data = (int64_t*)buffer->data; for (int64_t i = 0; i < len; i++) { // UBSAN warns for buffer_data[i] = nan if (R_IsNA(x_data[i]) || R_IsNaN(x_data[i])) { buffer_data[i] = 0; } else { buffer_data[i] = (int64_t)x_data[i]; } } buffer->size_bytes = len * sizeof(int64_t); } else { // double -> int32_t struct ArrowBuffer* buffer = ArrowArrayBuffer(array, 1); result = ArrowBufferReserve(buffer, len * sizeof(int32_t)); if (result != NANOARROW_OK) { Rf_error("ArrowBufferReserve() failed"); } int32_t* buffer_data = (int32_t*)buffer->data; // It's easy to accidentally overflow here, so make sure to warn int64_t n_overflow = 0; for (int64_t i = 0; i < len; i++) { // UBSAN warns for buffer_data[i] = nan if (R_IsNA(x_data[i]) || R_IsNaN(x_data[i])) { buffer_data[i] = 0; } else if (x_data[i] > INT_MAX || x_data[i] < INT_MIN) { n_overflow++; buffer_data[i] = 0; } else { buffer_data[i] = (int32_t)x_data[i]; } } if (n_overflow > 0) { warn_lossy_conversion(n_overflow, "overflowed in double -> na_int32() creation"); } buffer->size_bytes = len * sizeof(int32_t); } // Set the array fields array->length = len; array->offset = 0; int64_t null_count = 0; // Look for the first null (will be the last index if there are none) int64_t first_null = -1; for (int64_t i = 0; i < len; i++) { if (R_IsNA(x_data[i]) || R_IsNaN(x_data[i])) { first_null = i; break; } } // If there are nulls, pack the validity buffer if (first_null != -1) { struct ArrowBitmap bitmap; ArrowBitmapInit(&bitmap); result = ArrowBitmapReserve(&bitmap, len); if (result != NANOARROW_OK) { Rf_error("ArrowBitmapReserve() failed"); } 
ArrowBitmapAppendUnsafe(&bitmap, 1, first_null); for (int64_t i = first_null; i < len; i++) { uint8_t is_valid = !R_IsNA(x_data[i]) && !R_IsNaN(x_data[i]); null_count += !is_valid; ArrowBitmapAppendUnsafe(&bitmap, is_valid, 1); } ArrowArraySetValidityBitmap(array, &bitmap); } array->null_count = null_count; result = ArrowArrayFinishBuildingDefault(array, error); if (result != NANOARROW_OK) { Rf_error("ArrowArrayFinishBuildingDefault(): %s", error->message); } } static void as_array_chr(SEXP x_sexp, struct ArrowArray* array, SEXP schema_xptr, struct ArrowSchemaView* schema_view, struct ArrowError* error) { // Only consider the default create for now if (schema_view->type != NANOARROW_TYPE_STRING) { call_as_nanoarrow_array(x_sexp, array, schema_xptr, "as_nanoarrow_array_from_c"); return; } int64_t len = Rf_xlength(x_sexp); int result = ArrowArrayInitFromType(array, NANOARROW_TYPE_STRING); if (result != NANOARROW_OK) { Rf_error("ArrowArrayInitFromType() failed"); } // Keep these buffers under the umbrella of the array so that we don't have // to worry about cleaning them up if STRING_ELT jumps struct ArrowBuffer* offset_buffer = ArrowArrayBuffer(array, 1); struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 2); result = ArrowBufferReserve(offset_buffer, (len + 1) * sizeof(int32_t)); if (result != NANOARROW_OK) { Rf_error("ArrowBufferReserve() failed"); } int64_t null_count = 0; int32_t cumulative_len = 0; ArrowBufferAppendUnsafe(offset_buffer, &cumulative_len, sizeof(int32_t)); for (int64_t i = 0; i < len; i++) { SEXP item = STRING_ELT(x_sexp, i); if (item == NA_STRING) { null_count++; } else { const void* vmax = vmaxget(); const char* item_utf8 = Rf_translateCharUTF8(item); int64_t item_size = strlen(item_utf8); if ((item_size + cumulative_len) > INT_MAX) { Rf_error("Use na_large_string() to convert character() with total size > 2GB"); } int result = ArrowBufferAppend(data_buffer, item_utf8, item_size); if (result != NANOARROW_OK) { 
Rf_error("ArrowBufferAppend() failed"); } cumulative_len += (int32_t)item_size; vmaxset(vmax); } ArrowBufferAppendUnsafe(offset_buffer, &cumulative_len, sizeof(int32_t)); } // Set the array fields array->length = len; array->offset = 0; // If there are nulls, pack the validity buffer if (null_count > 0) { struct ArrowBitmap bitmap; ArrowBitmapInit(&bitmap); result = ArrowBitmapReserve(&bitmap, len); if (result != NANOARROW_OK) { Rf_error("ArrowBitmapReserve() failed"); } for (int64_t i = 0; i < len; i++) { uint8_t is_valid = STRING_ELT(x_sexp, i) != NA_STRING; ArrowBitmapAppendUnsafe(&bitmap, is_valid, 1); } ArrowArraySetValidityBitmap(array, &bitmap); } array->null_count = null_count; result = ArrowArrayFinishBuildingDefault(array, error); if (result != NANOARROW_OK) { Rf_error("ArrowArrayFinishBuildingDefault(): %s", error->message); } } static void as_array_default(SEXP x_sexp, struct ArrowArray* array, SEXP schema_xptr, struct ArrowError* error); static void as_array_data_frame(SEXP x_sexp, struct ArrowArray* array, SEXP schema_xptr, struct ArrowSchemaView* schema_view, struct ArrowError* error) { struct ArrowSchema* schema = nanoarrow_schema_from_xptr(schema_xptr); switch (schema_view->type) { case NANOARROW_TYPE_SPARSE_UNION: case NANOARROW_TYPE_DENSE_UNION: call_as_nanoarrow_array(x_sexp, array, schema_xptr, "union_array_from_data_frame"); return; case NANOARROW_TYPE_STRUCT: break; default: call_as_nanoarrow_array(x_sexp, array, schema_xptr, "as_nanoarrow_array_from_c"); return; } if (Rf_xlength(x_sexp) != schema->n_children) { Rf_error("Expected %ld schema children but found %ld", (long)Rf_xlength(x_sexp), (long)schema->n_children); } int result = ArrowArrayInitFromType(array, NANOARROW_TYPE_STRUCT); if (result != NANOARROW_OK) { Rf_error("ArrowArrayInitFromType() failed"); } result = ArrowArrayAllocateChildren(array, schema->n_children); if (result != NANOARROW_OK) { Rf_error("ArrowArrayAllocateChildren() failed"); } for (int64_t i = 0; i < 
schema->n_children; i++) { SEXP child_xptr = PROTECT(borrow_schema_child_xptr(schema_xptr, i)); as_array_default(VECTOR_ELT(x_sexp, i), array->children[i], child_xptr, error); UNPROTECT(1); } array->length = nanoarrow_data_frame_size(x_sexp); array->null_count = 0; array->offset = 0; } static void as_array_list(SEXP x_sexp, struct ArrowArray* array, SEXP schema_xptr, struct ArrowSchemaView* schema_view, struct ArrowError* error) { // We handle list(raw()) in C but fall back to S3 for other types of list output. // Arbitrary nested list support is complicated in C without some concept of a // "builder", which we don't use. if (schema_view->type != NANOARROW_TYPE_BINARY) { call_as_nanoarrow_array(x_sexp, array, schema_xptr, "as_nanoarrow_array_from_c"); return; } int result = ArrowArrayInitFromType(array, schema_view->type); if (result != NANOARROW_OK) { Rf_error("ArrowArrayInitFromType() failed"); } int64_t len = Rf_xlength(x_sexp); struct ArrowBuffer* offset_buffer = ArrowArrayBuffer(array, 1); struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 2); result = ArrowBufferReserve(offset_buffer, (len + 1) * sizeof(int32_t)); if (result != NANOARROW_OK) { Rf_error("ArrowBufferReserve() failed"); } int64_t null_count = 0; int32_t cumulative_len = 0; ArrowBufferAppendUnsafe(offset_buffer, &cumulative_len, sizeof(int32_t)); for (int64_t i = 0; i < len; i++) { SEXP item = VECTOR_ELT(x_sexp, i); if (item == R_NilValue) { ArrowBufferAppendUnsafe(offset_buffer, &cumulative_len, sizeof(int32_t)); null_count++; continue; } if (Rf_isObject(item) || TYPEOF(item) != RAWSXP) { Rf_error("All list items must be raw() or NULL in conversion to na_binary()"); } int64_t item_size = Rf_xlength(item); if ((item_size + cumulative_len) > INT_MAX) { Rf_error("Use na_large_binary() to convert list(raw()) with total size > 2GB"); } result = ArrowBufferAppend(data_buffer, RAW(item), item_size); if (result != NANOARROW_OK) { Rf_error("ArrowBufferAppend() failed"); } cumulative_len += 
(int32_t)item_size; ArrowBufferAppendUnsafe(offset_buffer, &cumulative_len, sizeof(int32_t)); } // Set the array fields array->length = len; array->offset = 0; // If there are nulls, pack the validity buffer if (null_count > 0) { struct ArrowBitmap bitmap; ArrowBitmapInit(&bitmap); result = ArrowBitmapReserve(&bitmap, len); if (result != NANOARROW_OK) { Rf_error("ArrowBitmapReserve() failed"); } for (int64_t i = 0; i < len; i++) { uint8_t is_valid = VECTOR_ELT(x_sexp, i) != R_NilValue; ArrowBitmapAppendUnsafe(&bitmap, is_valid, 1); } ArrowArraySetValidityBitmap(array, &bitmap); } array->null_count = null_count; result = ArrowArrayFinishBuildingDefault(array, error); if (result != NANOARROW_OK) { Rf_error("ArrowArrayFinishBuildingDefault(): %s", error->message); } } static void as_array_default(SEXP x_sexp, struct ArrowArray* array, SEXP schema_xptr, struct ArrowError* error) { struct ArrowSchema* schema = nanoarrow_schema_from_xptr(schema_xptr); struct ArrowSchemaView schema_view; int result = ArrowSchemaViewInit(&schema_view, schema, error); if (result != NANOARROW_OK) { Rf_error("ArrowSchemaViewInit(): %s", error->message); } // Ensure that extension types dispatch from R regardless of source if (schema_view.extension_name.size_bytes > 0) { call_as_nanoarrow_array(x_sexp, array, schema_xptr, "as_nanoarrow_array_from_c"); return; } if (Rf_isObject(x_sexp)) { if (Rf_inherits(x_sexp, "data.frame")) { as_array_data_frame(x_sexp, array, schema_xptr, &schema_view, error); return; } else { call_as_nanoarrow_array(x_sexp, array, schema_xptr, "as_nanoarrow_array_from_c"); return; } } switch (TYPEOF(x_sexp)) { case LGLSXP: as_array_lgl(x_sexp, array, schema_xptr, &schema_view, error); return; case INTSXP: as_array_int(x_sexp, array, schema_xptr, &schema_view, error); return; case REALSXP: as_array_dbl(x_sexp, array, schema_xptr, &schema_view, error); return; case STRSXP: as_array_chr(x_sexp, array, schema_xptr, &schema_view, error); return; case VECSXP: 
as_array_list(x_sexp, array, schema_xptr, &schema_view, error); return; default: call_as_nanoarrow_array(x_sexp, array, schema_xptr, "as_nanoarrow_array_from_c"); return; } } SEXP nanoarrow_c_as_array_default(SEXP x_sexp, SEXP schema_xptr) { SEXP array_xptr = PROTECT(nanoarrow_array_owning_xptr()); struct ArrowArray* array = nanoarrow_output_array_from_xptr(array_xptr); struct ArrowError error; as_array_default(x_sexp, array, schema_xptr, &error); array_xptr_set_schema(array_xptr, schema_xptr); UNPROTECT(1); return array_xptr; } nanoarrow/src/altrep.c0000644000176200001440000002045514547575511014526 0ustar liggesusers// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #define R_NO_REMAP #include #include #include #include #include "altrep.h" #include "array.h" #include "convert.h" #include "nanoarrow.h" #include "util.h" #ifdef HAS_ALTREP // This file defines all ALTREP classes used to speed up conversion // from an arrow_array to an R vector. Currently only string and // large string arrays are converted to ALTREP. // // All ALTREP classes follow some common patterns: // // - R_altrep_data1() holds an external pointer to a struct RConverter. // - R_altrep_data2() holds the materialized version of the vector. 
// - When materialization happens, we set R_altrep_data1() to R_NilValue // to ensure we don't hold on to any more resources than needed. static R_xlen_t nanoarrow_altrep_length(SEXP altrep_sexp) { SEXP converter_xptr = R_altrep_data1(altrep_sexp); if (converter_xptr == R_NilValue) { return Rf_xlength(R_altrep_data2(altrep_sexp)); } struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr); return converter->array_view.array->length; } static Rboolean nanoarrow_altrep_inspect(SEXP altrep_sexp, int pre, int deep, int pvec, void (*inspect_subtree)(SEXP, int, int, int)) { SEXP converter_xptr = R_altrep_data1(altrep_sexp); const char* materialized = ""; if (converter_xptr == R_NilValue) { materialized = "materialized "; } R_xlen_t len = nanoarrow_altrep_length(altrep_sexp); const char* class_name = nanoarrow_altrep_class(altrep_sexp); Rprintf("<%s%s[%ld]>\n", materialized, class_name, (long)len); return TRUE; } static SEXP nanoarrow_altstring_elt(SEXP altrep_sexp, R_xlen_t i) { SEXP converter_xptr = R_altrep_data1(altrep_sexp); if (converter_xptr == R_NilValue) { return STRING_ELT(R_altrep_data2(altrep_sexp), i); } struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr); if (ArrowArrayViewIsNull(&converter->array_view, i)) { return NA_STRING; } struct ArrowStringView item = ArrowArrayViewGetStringUnsafe(&converter->array_view, i); return Rf_mkCharLenCE(item.data, (int)item.size_bytes, CE_UTF8); } static SEXP nanoarrow_altstring_materialize(SEXP altrep_sexp) { SEXP converter_xptr = R_altrep_data1(altrep_sexp); if (converter_xptr == R_NilValue) { return R_altrep_data2(altrep_sexp); } if (nanoarrow_converter_materialize_all(converter_xptr) != NANOARROW_OK) { Rf_error("Error materializing altstring"); } if (nanoarrow_converter_finalize(converter_xptr) != NANOARROW_OK) { Rf_error("Error finalizing materialized altstring"); } SEXP result_sexp = PROTECT(nanoarrow_converter_release_result(converter_xptr)); 
R_set_altrep_data2(altrep_sexp, result_sexp); R_set_altrep_data1(altrep_sexp, R_NilValue); UNPROTECT(1); return result_sexp; } static void* nanoarrow_altrep_dataptr(SEXP altrep_sexp, Rboolean writable) { return DATAPTR(nanoarrow_altstring_materialize(altrep_sexp)); } static const void* nanoarrow_altrep_dataptr_or_null(SEXP altrep_sexp) { SEXP converter_xptr = R_altrep_data1(altrep_sexp); if (converter_xptr == R_NilValue) { return DATAPTR_OR_NULL(R_altrep_data2(altrep_sexp)); } return NULL; } static R_altrep_class_t nanoarrow_altrep_chr_cls; #endif static void register_nanoarrow_altstring(DllInfo* info) { #ifdef HAS_ALTREP nanoarrow_altrep_chr_cls = R_make_altstring_class("nanoarrow::altrep_chr", "nanoarrow", info); R_set_altrep_Length_method(nanoarrow_altrep_chr_cls, &nanoarrow_altrep_length); R_set_altrep_Inspect_method(nanoarrow_altrep_chr_cls, &nanoarrow_altrep_inspect); R_set_altvec_Dataptr_or_null_method(nanoarrow_altrep_chr_cls, &nanoarrow_altrep_dataptr_or_null); R_set_altvec_Dataptr_method(nanoarrow_altrep_chr_cls, &nanoarrow_altrep_dataptr); R_set_altstring_Elt_method(nanoarrow_altrep_chr_cls, &nanoarrow_altstring_elt); // Notes about other available methods: // // - The no_na method never seems to get called (anyNA() doesn't seem to // use it) // - Because set_Elt is not defined, SET_STRING_ELT() will modify the // technically modify the materialized value. The object has been marked // immutable but in the case of a string this is fine because we materialize // when this happens (via Dataptr). // - It may be beneficial to implement the Extract_subset method to defer string // conversion even longer since this is expensive compared to rearranging integer // indices. // - The duplicate method may be useful because it's used when setting attributes // or unclassing the vector. 
#endif } void register_nanoarrow_altrep(DllInfo* info) { register_nanoarrow_altstring(info); } SEXP nanoarrow_c_make_altrep_chr(SEXP array_xptr) { #ifdef HAS_ALTREP SEXP schema_xptr = array_xptr_get_schema(array_xptr); // Create the converter SEXP converter_xptr = PROTECT(nanoarrow_converter_from_type(VECTOR_TYPE_CHR)); if (nanoarrow_converter_set_schema(converter_xptr, schema_xptr) != NANOARROW_OK) { nanoarrow_converter_stop(converter_xptr); } struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr); switch (converter->array_view.storage_type) { case NANOARROW_TYPE_NA: case NANOARROW_TYPE_STRING: case NANOARROW_TYPE_LARGE_STRING: break; default: UNPROTECT(1); return R_NilValue; } // Ensure the array that we're attaching to this ALTREP object does not keep its // parent struct alive unnecessarily (i.e., a user can select only a few columns // and the memory for the unused columns will be released). SEXP array_xptr_independent = PROTECT(array_xptr_ensure_independent(array_xptr)); if (nanoarrow_converter_set_array(converter_xptr, array_xptr_independent) != NANOARROW_OK) { nanoarrow_converter_stop(converter_xptr); } Rf_setAttrib(converter_xptr, R_ClassSymbol, nanoarrow_cls_altrep_chr); SEXP out = PROTECT(R_new_altrep(nanoarrow_altrep_chr_cls, converter_xptr, R_NilValue)); MARK_NOT_MUTABLE(out); UNPROTECT(3); return out; #else return R_NilValue; #endif } SEXP nanoarrow_c_is_altrep(SEXP x_sexp) { return Rf_ScalarLogical(is_nanoarrow_altrep(x_sexp)); } SEXP nanoarrow_c_altrep_is_materialized(SEXP x_sexp) { const char* class_name = nanoarrow_altrep_class(x_sexp); if (class_name == NULL || strncmp(class_name, "nanoarrow::", 11) != 0) { return Rf_ScalarLogical(NA_LOGICAL); } else { return Rf_ScalarLogical(R_altrep_data1(x_sexp) == R_NilValue); } } SEXP nanoarrow_c_altrep_force_materialize(SEXP x_sexp, SEXP recursive_sexp) { // The recursive flag lets a developer/user force materialization of any // string columns in a data.frame that came from 
nanoarrow. if (Rf_inherits(x_sexp, "data.frame") && LOGICAL(recursive_sexp)[0]) { int n_materialized = 0; for (R_xlen_t i = 0; i < Rf_xlength(x_sexp); i++) { SEXP n_materialized_sexp = PROTECT( nanoarrow_c_altrep_force_materialize(VECTOR_ELT(x_sexp, i), recursive_sexp)); n_materialized += INTEGER(n_materialized_sexp)[0]; UNPROTECT(1); } return Rf_ScalarInteger(n_materialized); } const char* class_name = nanoarrow_altrep_class(x_sexp); if (class_name && strcmp(class_name, "nanoarrow::altrep_chr") == 0) { // Force materialization even if already materialized (the method // should be safe to call more than once as written here) int already_materialized = R_altrep_data1(x_sexp) == R_NilValue; nanoarrow_altstring_materialize(x_sexp); return Rf_ScalarInteger(!already_materialized); } else { return Rf_ScalarInteger(0); } } nanoarrow/src/util.h0000644000176200001440000000447714547575511014227 0ustar liggesusers// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. 
#ifndef R_UTIL_H_INCLUDED #define R_UTIL_H_INCLUDED #include #include #include extern SEXP nanoarrow_ns_pkg; extern SEXP nanoarrow_cls_array; extern SEXP nanoarrow_cls_altrep_chr; extern SEXP nanoarrow_cls_array_view; extern SEXP nanoarrow_cls_data_frame; extern SEXP nanoarrow_cls_schema; extern SEXP nanoarrow_cls_array_stream; extern SEXP nanoarrow_cls_buffer; void nanoarrow_init_cached_sexps(void); // Internal abstractions for R_PreserveObject and R_ReleaseObject // that provide an opportunity for debugging information about // preserved object lifecycle and possible future optimizations. // These implementations use C++ and live in nanoarrow_cpp.cc void nanoarrow_preserve_init(void); void nanoarrow_preserve_sexp(SEXP obj); void nanoarrow_release_sexp(SEXP obj); int64_t nanoarrow_preserved_count(void); int64_t nanoarrow_preserved_empty(void); int nanoarrow_is_main_thread(void); // For testing void nanoarrow_preserve_and_release_on_other_thread(SEXP obj); // Checker for very small mallocs() static inline void check_trivial_alloc(const void* ptr, const char* ptr_type) { if (ptr == NULL) { Rf_error("ArrowMalloc(sizeof(%s)) failed", ptr_type); // # nocov } } // So that lengths >INT_MAX do not overflow an INTSXP. Most places // in R return an integer length except for lengths where this is not // possible. static inline SEXP length_sexp_from_int64(int64_t value) { if (value < INT_MAX) { return Rf_ScalarInteger((int)value); } else { return Rf_ScalarReal((double)value); } } #endif nanoarrow/src/buffer.h0000644000176200001440000000716514547575511014520 0ustar liggesusers// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. 
You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #ifndef R_NANOARROW_BUFFER_H_INCLUDED #define R_NANOARROW_BUFFER_H_INCLUDED #include #include #include "nanoarrow.h" #include "util.h" void finalize_buffer_xptr(SEXP buffer_xptr); void nanoarrow_sexp_deallocator(struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t size); // Create an external pointer with the proper class and that will release any // non-null, non-released pointer when garbage collected. static inline SEXP buffer_owning_xptr(void) { struct ArrowBuffer* buffer = (struct ArrowBuffer*)ArrowMalloc(sizeof(struct ArrowBuffer)); ArrowBufferInit(buffer); SEXP buffer_xptr = PROTECT(R_MakeExternalPtr(buffer, R_NilValue, R_NilValue)); Rf_setAttrib(buffer_xptr, R_ClassSymbol, nanoarrow_cls_buffer); R_RegisterCFinalizer(buffer_xptr, &finalize_buffer_xptr); UNPROTECT(1); return buffer_xptr; } // Create an arrow_buffer with a deallocator that will release shelter when // the buffer is no longer needed. 
static inline void buffer_borrowed(struct ArrowBuffer* buffer, const void* addr, int64_t size_bytes, SEXP shelter) { buffer->allocator = ArrowBufferDeallocator(&nanoarrow_sexp_deallocator, shelter); buffer->data = (uint8_t*)addr; buffer->size_bytes = size_bytes; buffer->capacity_bytes = size_bytes; nanoarrow_preserve_sexp(shelter); } static inline SEXP buffer_borrowed_xptr(const void* addr, int64_t size_bytes, SEXP shelter) { SEXP buffer_xptr = PROTECT(buffer_owning_xptr()); // Don't bother with a preserve/release if the buffer is NULL if (addr == NULL) { UNPROTECT(1); return buffer_xptr; } struct ArrowBuffer* buffer = (struct ArrowBuffer*)R_ExternalPtrAddr(buffer_xptr); buffer_borrowed(buffer, addr, size_bytes, shelter); UNPROTECT(1); return buffer_xptr; } static inline void buffer_borrowed_xptr_set_type(SEXP buffer_xptr, enum ArrowBufferType buffer_type, enum ArrowType buffer_data_type, int64_t element_size_bits) { SEXP buffer_types_sexp = PROTECT(Rf_allocVector(INTSXP, 3)); INTEGER(buffer_types_sexp)[0] = buffer_type; INTEGER(buffer_types_sexp)[1] = buffer_data_type; INTEGER(buffer_types_sexp)[2] = (int32_t)element_size_bits; R_SetExternalPtrTag(buffer_xptr, buffer_types_sexp); UNPROTECT(1); } static inline struct ArrowBuffer* buffer_from_xptr(SEXP buffer_xptr) { if (!Rf_inherits(buffer_xptr, "nanoarrow_buffer")) { Rf_error("`buffer` argument that is not a nanoarrow_buffer()"); } struct ArrowBuffer* buffer = (struct ArrowBuffer*)R_ExternalPtrAddr(buffer_xptr); if (buffer == NULL) { Rf_error("nanoarrow_buffer is an external pointer to NULL"); } return buffer; } #endif nanoarrow/src/materialize_int.h0000644000176200001440000001145514547575511016424 0ustar liggesusers// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. 
The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #ifndef R_MATERIALIZE_INT_H_INCLUDED #define R_MATERIALIZE_INT_H_INCLUDED #include #include #include "materialize_common.h" #include "nanoarrow.h" static inline int nanoarrow_materialize_int(struct ArrayViewSlice* src, struct VectorSlice* dst, struct MaterializeOptions* options) { if (src->array_view->array->dictionary != NULL) { return ENOTSUP; } int* result = INTEGER(dst->vec_sexp); int64_t n_bad_values = 0; // True for all the types supported here const uint8_t* is_valid = src->array_view->buffer_views[0].data.as_uint8; int64_t raw_src_offset = src->array_view->array->offset + src->offset; // Fill the buffer switch (src->array_view->storage_type) { case NANOARROW_TYPE_NA: for (R_xlen_t i = 0; i < dst->length; i++) { result[dst->offset + i] = NA_INTEGER; } break; case NANOARROW_TYPE_INT32: memcpy(result + dst->offset, src->array_view->buffer_views[1].data.as_int32 + raw_src_offset, dst->length * sizeof(int32_t)); // Set any nulls to NA_INTEGER if (is_valid != NULL && src->array_view->array->null_count != 0) { for (R_xlen_t i = 0; i < dst->length; i++) { if (!ArrowBitGet(is_valid, raw_src_offset + i)) { result[dst->offset + i] = NA_INTEGER; } } } break; case NANOARROW_TYPE_BOOL: ArrowBitsUnpackInt32( src->array_view->buffer_views[1].data.as_uint8 + raw_src_offset, raw_src_offset, dst->length, result + dst->offset); // Set any nulls to NA_LOGICAL if (is_valid != NULL && 
src->array_view->array->null_count != 0) { for (R_xlen_t i = 0; i < dst->length; i++) { if (!ArrowBitGet(is_valid, raw_src_offset + i)) { result[dst->offset + i] = NA_LOGICAL; } } } break; case NANOARROW_TYPE_INT8: case NANOARROW_TYPE_UINT8: case NANOARROW_TYPE_INT16: case NANOARROW_TYPE_UINT16: // No need to bounds check for these types for (R_xlen_t i = 0; i < dst->length; i++) { result[dst->offset + i] = (int32_t)ArrowArrayViewGetIntUnsafe(src->array_view, src->offset + i); } // Set any nulls to NA_INTEGER if (is_valid != NULL && src->array_view->array->null_count != 0) { for (R_xlen_t i = 0; i < dst->length; i++) { if (!ArrowBitGet(is_valid, raw_src_offset + i)) { result[dst->offset + i] = NA_INTEGER; } } } break; case NANOARROW_TYPE_UINT32: case NANOARROW_TYPE_INT64: case NANOARROW_TYPE_UINT64: case NANOARROW_TYPE_FLOAT: case NANOARROW_TYPE_DOUBLE: // Loop + bounds check. Because we don't know what memory might be // in a null slot, we have to check nulls if there are any. if (is_valid != NULL && src->array_view->array->null_count != 0) { for (R_xlen_t i = 0; i < dst->length; i++) { if (ArrowBitGet(is_valid, raw_src_offset + i)) { int64_t value = ArrowArrayViewGetIntUnsafe(src->array_view, src->offset + i); if (value > INT_MAX || value <= NA_INTEGER) { result[dst->offset + i] = NA_INTEGER; n_bad_values++; } else { result[dst->offset + i] = (int32_t)value; } } else { result[dst->offset + i] = NA_INTEGER; } } } else { for (R_xlen_t i = 0; i < dst->length; i++) { int64_t value = ArrowArrayViewGetIntUnsafe(src->array_view, src->offset + i); if (value > INT_MAX || value <= NA_INTEGER) { result[dst->offset + i] = NA_INTEGER; n_bad_values++; } else { result[dst->offset + i] = (int32_t)value; } } } break; default: return EINVAL; } if (n_bad_values > 0) { warn_lossy_conversion(n_bad_values, "outside integer range set to NA"); } return NANOARROW_OK; } #endif nanoarrow/src/buffer.c0000644000176200001440000001527214547575511014511 0ustar liggesusers// Licensed to the 
Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #define R_NO_REMAP #include #include #include #include "buffer.h" #include "nanoarrow.h" void finalize_buffer_xptr(SEXP buffer_xptr) { struct ArrowBuffer* buffer = (struct ArrowBuffer*)R_ExternalPtrAddr(buffer_xptr); if (buffer != NULL) { ArrowBufferReset(buffer); ArrowFree(buffer); } } void nanoarrow_sexp_deallocator(struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t size) { nanoarrow_release_sexp((SEXP)allocator->private_data); } SEXP nanoarrow_c_as_buffer_default(SEXP x_sexp) { R_xlen_t len = Rf_xlength(x_sexp); const void* data = NULL; int64_t size_bytes = 0; int32_t element_size_bits = 0; enum ArrowType buffer_data_type = NANOARROW_TYPE_UNINITIALIZED; // For non-NA character(1), we use the first element if (TYPEOF(x_sexp) == STRSXP && len == 1) { return nanoarrow_c_as_buffer_default(STRING_ELT(x_sexp, 0)); } switch (TYPEOF(x_sexp)) { case NILSXP: data = NULL; break; case RAWSXP: case LGLSXP: case INTSXP: case REALSXP: case CPLXSXP: data = DATAPTR_RO(x_sexp); break; case CHARSXP: if (x_sexp != NA_STRING) { data = CHAR(x_sexp); break; } else { Rf_error("NA_character_ not supported in as_nanoarrow_buffer()"); } break; default: Rf_error("Unsupported type"); } switch (TYPEOF(x_sexp)) { case 
NILSXP: case RAWSXP: buffer_data_type = NANOARROW_TYPE_BINARY; size_bytes = len; element_size_bits = 8; break; case LGLSXP: case INTSXP: buffer_data_type = NANOARROW_TYPE_INT32; size_bytes = len * sizeof(int); element_size_bits = 8 * sizeof(int); break; case REALSXP: buffer_data_type = NANOARROW_TYPE_DOUBLE; size_bytes = len * sizeof(double); element_size_bits = 8 * sizeof(double); break; case CPLXSXP: buffer_data_type = NANOARROW_TYPE_DOUBLE; size_bytes = len * 2 * sizeof(double); element_size_bits = 8 * sizeof(double); break; case CHARSXP: buffer_data_type = NANOARROW_TYPE_STRING; size_bytes = Rf_xlength(x_sexp); element_size_bits = 8; break; default: break; } // Don't bother borrowing a zero-size buffer SEXP buffer_xptr; if (size_bytes == 0) { buffer_xptr = PROTECT(buffer_owning_xptr()); } else { buffer_xptr = PROTECT(buffer_borrowed_xptr(data, size_bytes, x_sexp)); } buffer_borrowed_xptr_set_type(buffer_xptr, NANOARROW_BUFFER_TYPE_DATA, buffer_data_type, element_size_bits); UNPROTECT(1); return buffer_xptr; } SEXP nanoarrow_c_buffer_append(SEXP buffer_xptr, SEXP new_buffer_xptr) { struct ArrowBuffer* buffer = buffer_from_xptr(buffer_xptr); struct ArrowBuffer* new_buffer = buffer_from_xptr(new_buffer_xptr); int result = ArrowBufferAppend(buffer, new_buffer->data, new_buffer->size_bytes); if (result != NANOARROW_OK) { Rf_error("ArrowBufferAppend() failed"); } return R_NilValue; } SEXP nanoarrow_c_buffer_info(SEXP buffer_xptr) { struct ArrowBuffer* buffer = buffer_from_xptr(buffer_xptr); SEXP buffer_types_sexp = R_ExternalPtrTag(buffer_xptr); SEXP buffer_type_sexp; SEXP buffer_data_type_sexp; int32_t element_size_bits; if (buffer_types_sexp == R_NilValue) { buffer_type_sexp = PROTECT(Rf_mkString("unknown")); buffer_data_type_sexp = PROTECT(Rf_mkString("unknown")); element_size_bits = 0; } else { enum ArrowBufferType buffer_type = INTEGER(buffer_types_sexp)[0]; const char* buffer_type_string; switch (buffer_type) { case NANOARROW_BUFFER_TYPE_VALIDITY: 
buffer_type_string = "validity"; break; case NANOARROW_BUFFER_TYPE_DATA_OFFSET: buffer_type_string = "data_offset"; break; case NANOARROW_BUFFER_TYPE_DATA: buffer_type_string = "data"; break; case NANOARROW_BUFFER_TYPE_TYPE_ID: buffer_type_string = "type_id"; break; case NANOARROW_BUFFER_TYPE_UNION_OFFSET: buffer_type_string = "union_offset"; break; default: buffer_type_string = "unknown"; break; } enum ArrowType buffer_data_type = INTEGER(buffer_types_sexp)[1]; const char* buffer_data_type_string = ArrowTypeString(buffer_data_type); buffer_type_sexp = PROTECT(Rf_mkString(buffer_type_string)); buffer_data_type_sexp = PROTECT(Rf_mkString(buffer_data_type_string)); element_size_bits = INTEGER(buffer_types_sexp)[2]; } const char* names[] = {"data", "size_bytes", "capacity_bytes", "type", "data_type", "element_size_bits", ""}; SEXP info = PROTECT(Rf_mkNamed(VECSXP, names)); SET_VECTOR_ELT(info, 0, R_MakeExternalPtr(buffer->data, NULL, buffer_xptr)); SET_VECTOR_ELT(info, 1, Rf_ScalarReal((double)buffer->size_bytes)); SET_VECTOR_ELT(info, 2, Rf_ScalarReal((double)buffer->capacity_bytes)); SET_VECTOR_ELT(info, 3, buffer_type_sexp); SET_VECTOR_ELT(info, 4, buffer_data_type_sexp); SET_VECTOR_ELT(info, 5, Rf_ScalarInteger(element_size_bits)); UNPROTECT(3); return info; } SEXP nanoarrow_c_buffer_head_bytes(SEXP buffer_xptr, SEXP max_bytes_sexp) { struct ArrowBuffer* buffer = buffer_from_xptr(buffer_xptr); int64_t max_bytes = (int64_t)REAL(max_bytes_sexp)[0]; if (buffer->size_bytes <= max_bytes) { return buffer_xptr; } SEXP buffer_clone_xptr = PROTECT(buffer_borrowed_xptr(buffer->data, max_bytes, buffer_xptr)); R_SetExternalPtrTag(buffer_clone_xptr, Rf_duplicate(R_ExternalPtrTag(buffer_xptr))); UNPROTECT(1); return buffer_clone_xptr; } SEXP nanoarrow_c_buffer_as_raw(SEXP buffer_xptr) { struct ArrowBuffer* buffer = buffer_from_xptr(buffer_xptr); SEXP result = PROTECT(Rf_allocVector(RAWSXP, buffer->size_bytes)); if (buffer->size_bytes > 0) { memcpy(RAW(result), buffer->data, 
buffer->size_bytes); } UNPROTECT(1); return result; } nanoarrow/src/materialize_common.h0000644000176200001440000000700414547575511017115 0ustar liggesusers// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #ifndef R_MATERIALIZE_COMMON_H_INCLUDED #define R_MATERIALIZE_COMMON_H_INCLUDED #include #include #include "nanoarrow.h" #include "util.h" // Vector types that have some special casing internally to avoid unnecessary allocations // or looping at the R level. Some of these types also need an SEXP ptype to communicate // additional information. 
enum VectorType { VECTOR_TYPE_UNINITIALIZED, VECTOR_TYPE_NULL, VECTOR_TYPE_UNSPECIFIED, VECTOR_TYPE_LGL, VECTOR_TYPE_INT, VECTOR_TYPE_DBL, VECTOR_TYPE_ALTREP_CHR, VECTOR_TYPE_CHR, VECTOR_TYPE_POSIXCT, VECTOR_TYPE_DATE, VECTOR_TYPE_DIFFTIME, VECTOR_TYPE_INTEGER64, VECTOR_TYPE_BLOB, VECTOR_TYPE_LIST_OF, VECTOR_TYPE_DATA_FRAME, VECTOR_TYPE_OTHER }; // More easily switch()able version of attr(difftime_obj, "units") enum RTimeUnits { R_TIME_UNIT_SECONDS, R_TIME_UNIT_MINUTES, R_TIME_UNIT_HOURS, R_TIME_UNIT_DAYS, R_TIME_UNIT_WEEKS }; // A "parsed" version of an SEXP ptype (like a SchemaView but for // R objects)) struct PTypeView { enum VectorType vector_type; int sexp_type; enum RTimeUnits r_time_units; SEXP ptype; }; // A wrapper around the ArrayView with an additional offset + length // representing a source of a materialization struct ArrayViewSlice { struct ArrowArrayView* array_view; int64_t offset; int64_t length; }; // A wapper around an SEXP vector with an additional offset + length. // This can be both a source and/or a target for copying from/to. struct VectorSlice { SEXP vec_sexp; R_xlen_t offset; R_xlen_t length; }; // Options for resolving a ptype and for materializing values. These are // currently unused but this struct is a placeholder for them when they // are implemented. struct MaterializeOptions { double scale; }; // A house for a conversion operation (i.e., zero or more arrays // getting converted into an R vector)). The structure of this // may change in the future but the API below should be relatively stable. 
// This is typically accessed via the external pointer whose API is defined // in convert.h struct RConverter { struct PTypeView ptype_view; struct ArrowSchemaView schema_view; struct ArrowArrayView array_view; struct ArrayViewSlice src; struct VectorSlice dst; struct MaterializeOptions* options; struct ArrowError error; R_xlen_t size; R_xlen_t capacity; R_xlen_t n_children; struct RConverter** children; }; static inline void warn_lossy_conversion(int64_t count, const char* msg) { SEXP fun = PROTECT(Rf_install("warn_lossy_conversion")); SEXP count_sexp = PROTECT(Rf_ScalarReal((double)count)); SEXP msg_sexp = PROTECT(Rf_mkString(msg)); SEXP call = PROTECT(Rf_lang3(fun, count_sexp, msg_sexp)); Rf_eval(call, nanoarrow_ns_pkg); UNPROTECT(4); } #endif nanoarrow/src/convert.h0000644000176200001440000000534614502402562014710 0ustar liggesusers// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #ifndef R_CONVERT_H_INCLUDED #define R_CONVERT_H_INCLUDED #include #include #include "nanoarrow.h" #include "materialize.h" // Create and initialize a converter. A converter's output R vector type // never changes once it has been created. 
SEXP nanoarrow_converter_from_type(enum VectorType vector_type); SEXP nanoarrow_converter_from_ptype(SEXP ptype); // Set the schema for the next array that will be materialized into // the R vector. In theory this could change although this has not been // implemented. This will also validate the schema. Returns an errno code. int nanoarrow_converter_set_schema(SEXP converter_xptr, SEXP schema_xptr); // Set the array target. This will also validate the array against the last // schema that was set. Returns an errno code. int nanoarrow_converter_set_array(SEXP converter_xptr, SEXP array_xptr); // Reserve space in the R vector output for additional elements. In theory // this could be used to provide growable behaviour; however, this is not // implemented. Returns an errno code. int nanoarrow_converter_reserve(SEXP converter_xptr, R_xlen_t additional_size); // Materialize the next n elements into the output. Returns the number of elements // that were actually materialized which may be less than n. R_xlen_t nanoarrow_converter_materialize_n(SEXP converter_xptr, R_xlen_t n); // Materialize the entire array into the output. Returns an errno code. int nanoarrow_converter_materialize_all(SEXP converter_xptr); // Finalize the output. Currently this just validates the length of the // output. Returns an errno code. int nanoarrow_converter_finalize(SEXP converter_xptr); // Returns the resulting SEXP and moves the result out of the protection // of the converter. SEXP nanoarrow_converter_release_result(SEXP converter_xptr); // Calls Rf_error() with the internal error buffer populated by above calls // that return a non-zero errno value. void nanoarrow_converter_stop(SEXP converter_xptr); #endif nanoarrow/src/array_stream.c0000644000176200001440000002075514547575511015733 0ustar liggesusers// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. 
See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #define R_NO_REMAP #include #include #include "array.h" #include "array_stream.h" #include "nanoarrow.h" #include "schema.h" #include "util.h" // Ideally user-supplied finalizers are written in such a way that they don't jump; // however if they do it is likely that memory will leak. Here, we use // R_tryCatchError to minimize the chances of that happening. 
static SEXP run_finalizer_wrapper(void* data) { SEXP finalizer_sym = PROTECT(Rf_install("array_stream_finalizer")); SEXP finalizer_call = PROTECT(Rf_lang1(finalizer_sym)); Rf_eval(finalizer_call, (SEXP)data); UNPROTECT(2); return R_NilValue; } static SEXP run_finalizer_error_handler(SEXP cond, void* hdata) { REprintf("Error evaluating user-supplied array stream finalizer"); return R_NilValue; } static void run_user_array_stream_finalizer(SEXP array_stream_xptr) { SEXP protected = PROTECT(R_ExternalPtrProtected(array_stream_xptr)); R_SetExternalPtrProtected(array_stream_xptr, R_NilValue); if (Rf_inherits(protected, "nanoarrow_array_stream_finalizer")) { R_tryCatchError(&run_finalizer_wrapper, protected, &run_finalizer_error_handler, NULL); } UNPROTECT(1); } SEXP nanoarrow_c_array_stream_get_schema(SEXP array_stream_xptr) { struct ArrowArrayStream* array_stream = nanoarrow_array_stream_from_xptr(array_stream_xptr); SEXP schema_xptr = PROTECT(nanoarrow_schema_owning_xptr()); struct ArrowSchema* schema = nanoarrow_output_schema_from_xptr(schema_xptr); int result = ArrowArrayStreamGetSchema(array_stream, schema, NULL); if (result != 0) { Rf_error("array_stream->get_schema(): [%d] %s", result, ArrowArrayStreamGetLastError(array_stream)); } UNPROTECT(1); return schema_xptr; } SEXP nanoarrow_c_array_stream_get_next(SEXP array_stream_xptr) { struct ArrowArrayStream* array_stream = nanoarrow_array_stream_from_xptr(array_stream_xptr); SEXP array_xptr = PROTECT(nanoarrow_array_owning_xptr()); struct ArrowArray* array = nanoarrow_output_array_from_xptr(array_xptr); int result = ArrowArrayStreamGetNext(array_stream, array, NULL); if (result != NANOARROW_OK) { Rf_error("array_stream->get_next(): [%d] %s", result, ArrowArrayStreamGetLastError(array_stream)); } UNPROTECT(1); return array_xptr; } SEXP nanoarrow_c_basic_array_stream(SEXP batches_sexp, SEXP schema_xptr, SEXP validate_sexp) { int validate = LOGICAL(validate_sexp)[0]; // Schema needs a copy here because 
ArrowBasicArrayStreamInit() takes ownership SEXP schema_copy_xptr = PROTECT(nanoarrow_schema_owning_xptr()); struct ArrowSchema* schema_copy = nanoarrow_output_schema_from_xptr(schema_copy_xptr); schema_export(schema_xptr, schema_copy); SEXP array_stream_xptr = PROTECT(nanoarow_array_stream_owning_xptr()); struct ArrowArrayStream* array_stream = nanoarrow_output_array_stream_from_xptr(array_stream_xptr); int64_t n_arrays = Rf_xlength(batches_sexp); if (ArrowBasicArrayStreamInit(array_stream, schema_copy, n_arrays) != NANOARROW_OK) { Rf_error("Failed to initialize array stream"); } struct ArrowArray array; for (int64_t i = 0; i < n_arrays; i++) { array_export(VECTOR_ELT(batches_sexp, i), &array); ArrowBasicArrayStreamSetArray(array_stream, i, &array); } if (validate) { struct ArrowError error; if (ArrowBasicArrayStreamValidate(array_stream, &error) != NANOARROW_OK) { Rf_error("ArrowBasicArrayStreamValidate(): %s", ArrowErrorMessage(&error)); } } UNPROTECT(2); return array_stream_xptr; } SEXP nanoarrow_c_array_list_total_length(SEXP list_of_array_xptr) { int64_t total_length = 0; R_xlen_t num_chunks = Rf_xlength(list_of_array_xptr); for (R_xlen_t i = 0; i < num_chunks; i++) { struct ArrowArray* chunk = (struct ArrowArray*)R_ExternalPtrAddr(VECTOR_ELT(list_of_array_xptr, i)); total_length += chunk->length; } return length_sexp_from_int64(total_length); } // Implementation of an ArrowArrayStream that keeps a dependent object valid struct WrapperArrayStreamData { SEXP parent_array_stream_xptr; struct ArrowArrayStream* parent_array_stream; }; static void finalize_wrapper_array_stream(struct ArrowArrayStream* array_stream) { if (array_stream->private_data != NULL) { struct WrapperArrayStreamData* data = (struct WrapperArrayStreamData*)array_stream->private_data; // Run the parent array stream release callback data->parent_array_stream->release(data->parent_array_stream); // If safe to do so, attempt to do an eager evaluation of a release // callback that may have been 
registered. If it is not safe to do so, // garbage collection will run any finalizers that have been set // on the chain of environments leading up to the finalizer. if (nanoarrow_is_main_thread()) { run_user_array_stream_finalizer(data->parent_array_stream_xptr); } nanoarrow_release_sexp(data->parent_array_stream_xptr); ArrowFree(array_stream->private_data); } array_stream->release = NULL; } static const char* wrapper_array_stream_get_last_error( struct ArrowArrayStream* array_stream) { struct WrapperArrayStreamData* data = (struct WrapperArrayStreamData*)array_stream->private_data; return data->parent_array_stream->get_last_error(data->parent_array_stream); } static int wrapper_array_stream_get_schema(struct ArrowArrayStream* array_stream, struct ArrowSchema* out) { struct WrapperArrayStreamData* data = (struct WrapperArrayStreamData*)array_stream->private_data; return data->parent_array_stream->get_schema(data->parent_array_stream, out); } static int wrapper_array_stream_get_next(struct ArrowArrayStream* array_stream, struct ArrowArray* out) { struct WrapperArrayStreamData* data = (struct WrapperArrayStreamData*)array_stream->private_data; return data->parent_array_stream->get_next(data->parent_array_stream, out); } void array_stream_export(SEXP parent_array_stream_xptr, struct ArrowArrayStream* array_stream_copy) { struct ArrowArrayStream* parent_array_stream = nanoarrow_array_stream_from_xptr(parent_array_stream_xptr); // If there is no dependent object, don't bother with this wrapper SEXP dependent_sexp = R_ExternalPtrProtected(parent_array_stream_xptr); if (dependent_sexp == R_NilValue) { ArrowArrayStreamMove(parent_array_stream, array_stream_copy); return; } // Allocate a new external pointer for an array stream (for consistency: // we always move an array stream when exporting) SEXP parent_array_stream_xptr_new = PROTECT(nanoarow_array_stream_owning_xptr()); struct ArrowArrayStream* parent_array_stream_new = (struct 
ArrowArrayStream*)R_ExternalPtrAddr(parent_array_stream_xptr_new); ArrowArrayStreamMove(parent_array_stream, parent_array_stream_new); R_SetExternalPtrProtected(parent_array_stream_xptr_new, dependent_sexp); array_stream_copy->private_data = NULL; array_stream_copy->get_last_error = &wrapper_array_stream_get_last_error; array_stream_copy->get_schema = &wrapper_array_stream_get_schema; array_stream_copy->get_next = &wrapper_array_stream_get_next; array_stream_copy->release = &finalize_wrapper_array_stream; struct WrapperArrayStreamData* data = (struct WrapperArrayStreamData*)ArrowMalloc(sizeof(struct WrapperArrayStreamData)); check_trivial_alloc(data, "struct WrapperArrayStreamData"); data->parent_array_stream_xptr = parent_array_stream_xptr_new; data->parent_array_stream = parent_array_stream_new; array_stream_copy->private_data = data; // Transfer responsibility for the stream_xptr to the C object nanoarrow_preserve_sexp(parent_array_stream_xptr_new); UNPROTECT(1); } nanoarrow/src/materialize_difftime.h0000644000176200001440000000520014355121773017402 0ustar liggesusers// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. 
#ifndef R_MATERIALIZE_DIFFTIME_H_INCLUDED #define R_MATERIALIZE_DIFFTIME_H_INCLUDED #include #include #include "materialize_common.h" #include "materialize_dbl.h" #include "nanoarrow.h" static inline int nanoarrow_materialize_difftime(struct RConverter* converter) { if (converter->ptype_view.sexp_type == REALSXP) { switch (converter->schema_view.type) { case NANOARROW_TYPE_NA: NANOARROW_RETURN_NOT_OK(nanoarrow_materialize_dbl(converter)); return NANOARROW_OK; case NANOARROW_TYPE_TIME32: case NANOARROW_TYPE_TIME64: case NANOARROW_TYPE_DURATION: NANOARROW_RETURN_NOT_OK(nanoarrow_materialize_dbl(converter)); break; default: return EINVAL; } double scale; switch (converter->ptype_view.r_time_units) { case R_TIME_UNIT_MINUTES: scale = 1.0 / 60; break; case R_TIME_UNIT_HOURS: scale = 1.0 / (60 * 60); break; case R_TIME_UNIT_DAYS: scale = 1.0 / (60 * 60 * 24); break; case R_TIME_UNIT_WEEKS: scale = 1.0 / (60 * 60 * 24 * 7); break; default: scale = 1.0; break; } switch (converter->schema_view.time_unit) { case NANOARROW_TIME_UNIT_SECOND: scale *= 1; break; case NANOARROW_TIME_UNIT_MILLI: scale *= 1e-3; break; case NANOARROW_TIME_UNIT_MICRO: scale *= 1e-6; break; case NANOARROW_TIME_UNIT_NANO: scale *= 1e-9; break; default: return EINVAL; } if (scale != 1) { double* result = REAL(converter->dst.vec_sexp); for (int64_t i = 0; i < converter->dst.length; i++) { result[converter->dst.offset + i] = result[converter->dst.offset + i] * scale; } } return NANOARROW_OK; } return EINVAL; } #endif nanoarrow/src/altrep.h0000644000176200001440000000365114355103326014516 0ustar liggesusers// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. 
You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #ifndef R_ALTREP_H_INCLUDED #define R_ALTREP_H_INCLUDED #include "Rversion.h" #include // ALTREP available in R >= 3.5 #if defined(R_VERSION) && R_VERSION >= R_Version(3, 5, 0) #define HAS_ALTREP #include // Returns the ALTREP class name or NULL if x is not an altrep // object. static inline const char* nanoarrow_altrep_class(SEXP x) { if (ALTREP(x)) { SEXP data_class_sym = CAR(ATTRIB(ALTREP_CLASS(x))); return CHAR(PRINTNAME(data_class_sym)); } else { return NULL; } } #else static inline const char* nanoarrow_altrep_class(SEXP x) { return NULL; } #endif // Performs the ALTREP type registration and should be called on package load void register_nanoarrow_altrep(DllInfo* info); // Checks if an object is an ALTREP object created by this package static inline int is_nanoarrow_altrep(SEXP x) { const char* class_name = nanoarrow_altrep_class(x); return class_name && strncmp(class_name, "nanoarrow::", 11) == 0; } // Creates an altstring vector backed by a nanoarrow array or returns // R_NilValue if the conversion is not possible. SEXP nanoarrow_c_make_altrep_chr(SEXP array_xptr); #endif nanoarrow/src/materialize.h0000644000176200001440000000347014377444470015550 0ustar liggesusers// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. 
You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #ifndef R_MATERIALIZE_H_INCLUDED #define R_MATERIALIZE_H_INCLUDED #include #include #include "materialize_common.h" // A heuristic to identify prototypes that should be treated like data frames // (i.e., including record-style vectors like POSIXct). This heuristic returns // true if ptype is a data.frame or is an S3 list with names. int nanoarrow_ptype_is_data_frame(SEXP ptype); // Returns the number of rows in a data.frame in a way that is least likely to // expand the attr(x, "row.names") R_xlen_t nanoarrow_data_frame_size(SEXP x); // Set rownames of a data.frame (with special handling if len > INT_MAX) void nanoarrow_set_rownames(SEXP x, R_xlen_t len); // Perform actual materializing of values (e.g., loop through buffers) int nanoarrow_materialize(struct RConverter* converter, SEXP converter_xptr); // Shortcut to allocate a vector based on a vector type or ptype SEXP nanoarrow_alloc_type(enum VectorType vector_type, R_xlen_t len); SEXP nanoarrow_materialize_realloc(SEXP ptype, R_xlen_t len); #endif nanoarrow/src/nanoarrow_cpp.cc0000644000176200001440000001361114502402506016226 0ustar liggesusers// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. 
You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #define R_NO_REMAP #include #include #include #include #include #include #include #include // Without this infrastructure, it's possible to check that all objects // are released by running devtools::test(); gc() in a fresh session and // making sure that nanoarrow:::preserved_count() is zero afterward. // When this isn't the case the process of debugging unreleased SEXPs // is almost impossible without the bookkeeping below. #if defined(NANOARROW_DEBUG_PRESERVE) #include #endif extern "C" void intptr_as_string(intptr_t ptr_int, char* buf) { std::string ptr_str = std::to_string(ptr_int); memcpy(buf, ptr_str.data(), ptr_str.size()); } #if defined(NANOARROW_DEBUG_PRESERVE) static std::string get_r_traceback(void) { SEXP fun = PROTECT(Rf_install("current_stack_trace_chr")); SEXP call = PROTECT(Rf_lang1(fun)); SEXP nanoarrow_str = PROTECT(Rf_mkString("nanoarrow")); SEXP nanoarrow_ns = PROTECT(R_FindNamespace(nanoarrow_str)); SEXP result = PROTECT(Rf_eval(call, nanoarrow_ns)); const char* traceback_chr = Rf_translateCharUTF8(STRING_ELT(result, 0)); std::string traceback_str(traceback_chr); UNPROTECT(5); return traceback_str; } #endif class PreservedSEXPRegistry { public: PreservedSEXPRegistry() : preserved_count_(0), main_thread_id_(std::this_thread::get_id()) {} int64_t size() { return preserved_count_; } bool is_main_thread() { return std::this_thread::get_id() == main_thread_id_; } void preserve(SEXP obj) { if (obj == R_NilValue) { return; } #if defined(NANOARROW_DEBUG_PRESERVE) Rprintf("PreservedSEXPRegistry::preserve(%p)\n", obj); #endif 
R_PreserveObject(obj); preserved_count_++; #if defined(NANOARROW_DEBUG_PRESERVE) if (tracebacks_.find(obj) != tracebacks_.end()) { tracebacks_[obj].first++; } else { tracebacks_[obj] = {1, get_r_traceback()}; } #endif } bool release(SEXP obj) { if (obj == R_NilValue) { return true; } #if defined(NANOARROW_DEBUG_PRESERVE) Rprintf("PreservedSEXPRegistry::release(%p)\n", obj); #endif // If there is an attempt to delete this object from another thread, // R_ReleaseObject() will almost certainly crash R or corrupt memory // leading to confusing errors. Instead, save a reference to the object // and provide an opportunity to delete it later. if (std::this_thread::get_id() != main_thread_id_) { std::lock_guard lock(trash_can_lock_); trash_can_.push_back(obj); return false; } else { R_ReleaseObject(obj); preserved_count_--; #if defined(NANOARROW_DEBUG_PRESERVE) if (tracebacks_.find(obj) != tracebacks_.end()) { tracebacks_[obj].first--; if (tracebacks_[obj].first == 0) { tracebacks_.erase(obj); } } #endif return true; } } int64_t empty_trash() { std::lock_guard lock(trash_can_lock_); int64_t trash_size = trash_can_.size(); for (SEXP obj : trash_can_) { R_ReleaseObject(obj); preserved_count_--; #if defined(NANOARROW_DEBUG_PRESERVE) if (tracebacks_.find(obj) != tracebacks_.end()) { tracebacks_[obj].first--; if (tracebacks_[obj].first == 0) { tracebacks_.erase(obj); } } #endif } trash_can_.clear(); #if defined(NANOARROW_DEBUG_PRESERVE) if (preserved_count_ > 0) { Rprintf("%ld unreleased SEXP(s) after emptying the trash:\n", (long)preserved_count_); for (const auto& item : tracebacks_) { Rprintf("----%p---- (%ld reference(s) remaining)\nFirst preserved at\n%s\n\n", item.first, item.second.first, item.second.second.c_str()); } } #endif return trash_size; } static PreservedSEXPRegistry& GetInstance() { static PreservedSEXPRegistry singleton; return singleton; } private: int64_t preserved_count_; std::thread::id main_thread_id_; std::vector trash_can_; std::mutex trash_can_lock_; 
#if defined(NANOARROW_DEBUG_PRESERVE) std::unordered_map> tracebacks_; #endif }; extern "C" void nanoarrow_preserve_init(void) { PreservedSEXPRegistry::GetInstance(); } extern "C" void nanoarrow_preserve_sexp(SEXP obj) { PreservedSEXPRegistry::GetInstance().preserve(obj); } extern "C" void nanoarrow_release_sexp(SEXP obj) { try { PreservedSEXPRegistry::GetInstance().release(obj); } catch (std::exception& e) { // Just for safety...we really don't want to crash here } } extern "C" int64_t nanoarrow_preserved_count(void) { return PreservedSEXPRegistry::GetInstance().size(); } extern "C" int64_t nanoarrow_preserved_empty(void) { try { return PreservedSEXPRegistry::GetInstance().empty_trash(); } catch (std::exception& e) { return 0; } } extern "C" int nanoarrow_is_main_thread(void) { return PreservedSEXPRegistry::GetInstance().is_main_thread(); } extern "C" void nanoarrow_preserve_and_release_on_other_thread(SEXP obj) { nanoarrow_preserve_sexp(obj); std::thread worker([obj] { nanoarrow_release_sexp(obj); }); worker.join(); } nanoarrow/configure.win0000755000176200001440000000151014410356632014764 0ustar liggesusers# Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
# Just call the original configure script ./configure nanoarrow/R/0000755000176200001440000000000014547575511012477 5ustar liggesusersnanoarrow/R/array-stream.R0000644000176200001440000002116314547575511015234 0ustar liggesusers# Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. #' Create ArrayStreams from batches #' #' @param batches A [list()] of [nanoarrow_array][as_nanoarrow_array] objects #' or objects that can be coerced via [as_nanoarrow_array()]. #' @param schema A [nanoarrow_schema][as_nanoarrow_schema] or `NULL` to guess #' based on the first schema. #' @param validate Use `FALSE` to skip the validation step (i.e., if you #' know that the arrays are valid). 
#' @return A [nanoarrow_array_stream][as_nanoarrow_array_stream]
#' @export #' #' @examples #' stream <- array_stream_set_finalizer( #' basic_array_stream(list(1:5)), #' function() message("All done!") #' ) #' stream$release() #' array_stream_set_finalizer <- function(array_stream, finalizer) { stopifnot(is.function(finalizer)) prot <- new.env(parent = emptyenv()) prot$array_stream_finalizer <- finalizer class(prot) <- "nanoarrow_array_stream_finalizer" nanoarrow_pointer_set_protected(array_stream, prot) out <- nanoarrow_allocate_array_stream() nanoarrow_pointer_export(array_stream, out) out } #' Convert an object to a nanoarrow array_stream #' #' In nanoarrow, an 'array stream' corresponds to the `struct ArrowArrayStream` #' as defined in the Arrow C Stream interface. This object is used to represent #' a stream of [arrays][as_nanoarrow_array] with a common #' [schema][as_nanoarrow_schema]. This is similar to an #' [arrow::RecordBatchReader] except it can be used to represent a stream of #' any type (not just record batches). Note that a stream of record batches #' and a stream of non-nullable struct arrays are represented identically. #' Also note that array streams are mutable objects and are passed by #' reference and not by value. #' #' @param x An object to convert to a array_stream #' @param ... 
Passed to S3 methods #' @inheritParams as_nanoarrow_array #' #' @return An object of class 'nanoarrow_array_stream' #' @export #' #' @examples #' (stream <- as_nanoarrow_array_stream(data.frame(x = 1:5))) #' stream$get_schema() #' stream$get_next() #' #' # The last batch is returned as NULL #' stream$get_next() #' #' # Release the stream #' stream$release() #' as_nanoarrow_array_stream <- function(x, ..., schema = NULL) { UseMethod("as_nanoarrow_array_stream") } #' @export as_nanoarrow_array_stream.nanoarrow_array_stream <- function(x, ..., schema = NULL) { if (is.null(schema)) { return(x) } inferred_schema <- infer_nanoarrow_schema(x) if (nanoarrow_schema_identical(schema, inferred_schema)) { return(x) } NextMethod() } #' @export as_nanoarrow_array_stream.nanoarrow_array <- function(x, ..., schema = NULL) { if (is.null(schema)) { return(basic_array_stream(list(x), validate = FALSE)) } inferred_schema <- infer_nanoarrow_schema(x) if (nanoarrow_schema_identical(schema, inferred_schema)) { return(basic_array_stream(list(x), validate = FALSE)) } as_nanoarrow_array_stream( as_nanoarrow_array_stream(x), schema = schema ) } #' @export as_nanoarrow_array_stream.default <- function(x, ..., schema = NULL) { assert_arrow_installed("default coerce to nanoarrow_array_stream") as_nanoarrow_array_stream( arrow::as_record_batch_reader(x, ..., schema = arrow::as_schema(schema)), schema = schema ) } #' @export as_nanoarrow_array_stream.data.frame <- function(x, ..., schema = NULL) { if (is.null(schema)) { schema <- infer_nanoarrow_schema(x) } else { schema <- as_nanoarrow_schema(schema) } x <- as_nanoarrow_array(x, schema = schema) basic_array_stream(list(x), schema = schema) } #' @export infer_nanoarrow_schema.nanoarrow_array_stream <- function(x, ...) { x$get_schema() } #' @export as.data.frame.nanoarrow_array_stream <- function(x, ...) { # Always release the input: we are always consuming the entire stream. 
# For more fine-grained behaviour on error, one can use # convert_array_stream() on.exit(x$release()) to <- infer_nanoarrow_ptype(x$get_schema()) if (!inherits(to, "data.frame")) { stop("Can't convert non-struct array stream to data.frame") } convert_array_stream(x, to) } #' @export as.vector.nanoarrow_array_stream <- function(x, mode) { on.exit(x$release()) convert_array_stream(x) } #' @importFrom utils str #' @export str.nanoarrow_array_stream <- function(object, ...) { cat(sprintf("%s\n", format(object))) if (nanoarrow_pointer_is_valid(object)) { # Use the str() of the list version but remove the first # line of the output ("List of 2") info <- list( get_schema = object$get_schema, get_next = object$get_next, release = object$release ) raw_str_output <- utils::capture.output(str(info, ..., give.attr = FALSE)) cat(paste0(raw_str_output[-1], collapse = "\n")) cat("\n") } invisible(object) } #' @export print.nanoarrow_array_stream <- function(x, ...) { str(x, ...) invisible(x) } #' @export format.nanoarrow_array_stream <- function(x, ...) { if (nanoarrow_pointer_is_valid(x)) { tryCatch( sprintf("", nanoarrow_schema_formatted(x$get_schema())), error = function(...) "" ) } else { "" } } # This is the list()-like interface to nanoarrow_array_stream that allows $ and [[ # to make nice auto-complete when interacting in an IDE #' @export length.nanoarrow_array_stream <- function(x, ...) { 3L } #' @export names.nanoarrow_array_stream <- function(x, ...) { c("get_schema", "get_next", "release") } #' @export `[[.nanoarrow_array_stream` <- function(x, i, ...) 
{ force(x) if (identical(i, "get_schema") || isTRUE(i == 1L)) { function() .Call(nanoarrow_c_array_stream_get_schema, x) } else if (identical(i, "get_next") || isTRUE(i == 2L)) { function(schema = x$get_schema(), validate = TRUE) { array <- .Call(nanoarrow_c_array_stream_get_next, x) if (!nanoarrow_pointer_is_valid(array)) { return(NULL) } nanoarrow_array_set_schema(array, schema, validate = validate) array } } else if (identical(i, "release") || isTRUE(i == 3L)) { function() nanoarrow_pointer_release(x) } else { NULL } } #' @export `$.nanoarrow_array_stream` <- function(x, i, ...) { x[[i]] } nanoarrow/R/zzz.R0000644000176200001440000000643714547061553013465 0ustar liggesusers# Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. # nocov start .onLoad <- function(...) 
{ register_vctrs_extension() s3_register("arrow::infer_type", "nanoarrow_array") s3_register("arrow::as_data_type", "nanoarrow_schema") s3_register("arrow::as_schema", "nanoarrow_schema") s3_register("arrow::as_arrow_array", "nanoarrow_array") s3_register("arrow::as_arrow_array", "nanoarrow_array_stream") s3_register("arrow::as_chunked_array", "nanoarrow_array") s3_register("arrow::as_chunked_array", "nanoarrow_array_stream") s3_register("arrow::as_record_batch", "nanoarrow_array") s3_register("arrow::as_arrow_table", "nanoarrow_array") s3_register("arrow::as_arrow_table", "nanoarrow_array_stream") s3_register("arrow::as_record_batch_reader", "nanoarrow_array_stream") s3_register("tibble::as_tibble", "nanoarrow_array") } # From the `vctrs` package (this function is intended to be copied # without attribution or license requirements to avoid a hard dependency on # vctrs: # https://github.com/r-lib/vctrs/blob/c2a7710fe55e3a2249c4fdfe75bbccbafcf38804/R/register-s3.R#L25-L31 s3_register <- function(generic, class, method = NULL) { stopifnot(is.character(generic), length(generic) == 1) stopifnot(is.character(class), length(class) == 1) pieces <- strsplit(generic, "::")[[1]] stopifnot(length(pieces) == 2) package <- pieces[[1]] generic <- pieces[[2]] caller <- parent.frame() get_method_env <- function() { top <- topenv(caller) if (isNamespace(top)) { asNamespace(environmentName(top)) } else { caller } } get_method <- function(method, env) { if (is.null(method)) { get(paste0(generic, ".", class), envir = get_method_env()) } else { method } } register <- function(...) 
{ envir <- asNamespace(package) # Refresh the method each time, it might have been updated by # `devtools::load_all()` method_fn <- get_method(method) stopifnot(is.function(method_fn)) # Only register if generic can be accessed if (exists(generic, envir)) { registerS3method(generic, class, method_fn, envir = envir) } else if (identical(Sys.getenv("NOT_CRAN"), "true")) { warning(sprintf( "Can't find generic `%s` in package %s to register S3 method.", generic, package )) } } # Always register hook in case package is later unloaded & reloaded setHook(packageEvent(package, "onLoad"), register) # Avoid registration failures during loading (pkgload or regular) if (isNamespaceLoaded(package)) { register() } invisible() } # nocov end nanoarrow/R/extension.R0000644000176200001440000001415714502402562014630 0ustar liggesusers# Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. #' Register Arrow extension types #' #' @param extension_name An Arrow extension type name (e.g., arrow.r.vctrs) #' @param extension_spec An extension specification inheriting from #' 'nanoarrow_extension_spec'. #' @param data Optional data to include in the extension type specification #' @param subclass A subclass for the extension type specification. 
Extension #' methods will dispatch on this object. #' #' @return #' - `nanoarrow_extension_spec()` returns an object of class #' 'nanoarrow_extension_spec'. #' - `register_nanoarrow_extension()` returns `extension_spec`, invisibly. #' - `unregister_nanoarrow_extension()` returns `extension_name`, invisibly. #' - `resolve_nanoarrow_extension()` returns an object of class #' 'nanoarrow_extension_spec' or NULL if the extension type was not #' registered. #' @export #' #' @examples #' nanoarrow_extension_spec("mynamespace.mytype", subclass = "mypackage_mytype_spec") nanoarrow_extension_spec <- function(data = list(), subclass = character()) { structure( data, class = union(subclass, "nanoarrow_extension_spec") ) } #' @rdname nanoarrow_extension_spec #' @export register_nanoarrow_extension <- function(extension_name, extension_spec) { extension_registry[[extension_name]] <- extension_spec invisible(extension_name) } #' @rdname nanoarrow_extension_spec #' @export unregister_nanoarrow_extension <- function(extension_name) { extension_registry[[extension_name]] <- NULL invisible(extension_name) } #' @rdname nanoarrow_extension_spec #' @export resolve_nanoarrow_extension <- function(extension_name) { extension_registry[[extension_name]] } #' Implement Arrow extension types #' #' @inheritParams nanoarrow_extension_spec #' @param warn_unregistered Use `FALSE` to infer/convert based on the storage #' type without a warning. #' @param x,array,to,schema,... Passed from [infer_nanoarrow_ptype()], #' [convert_array()], [as_nanoarrow_array()], and/or #' [as_nanoarrow_array_stream()]. #' #' @return #' - `infer_nanoarrow_ptype_extension()`: The R vector prototype to be used #' as the default conversion target. #' - `convert_array_extension()`: An R vector of type `to`. #' - `as_nanoarrow_array_extension()`: A [nanoarrow_array][as_nanoarrow_array] #' of type `schema`. 
#' @export #' infer_nanoarrow_ptype_extension <- function(extension_spec, x, ..., warn_unregistered = TRUE) { UseMethod("infer_nanoarrow_ptype_extension") } #' @rdname infer_nanoarrow_ptype_extension #' @export convert_array_extension <- function(extension_spec, array, to, ..., warn_unregistered = TRUE) { UseMethod("convert_array_extension") } #' @rdname infer_nanoarrow_ptype_extension #' @export as_nanoarrow_array_extension <- function(extension_spec, x, ..., schema = NULL) { UseMethod("as_nanoarrow_array_extension") } #' @export infer_nanoarrow_ptype_extension.default <- function(extension_spec, x, ..., warn_unregistered = TRUE) { if (warn_unregistered) { warn_unregistered_extension_type(x) } x$metadata[["ARROW:extension:name"]] <- NULL infer_nanoarrow_ptype(x) } #' @export convert_array_extension.default <- function(extension_spec, array, to, ..., warn_unregistered = TRUE) { storage <- .Call(nanoarrow_c_infer_schema_array, array) if (warn_unregistered) { warn_unregistered_extension_type(storage) } storage$metadata[["ARROW:extension:name"]] <- NULL array <- array_shallow_copy(array, validate = FALSE) nanoarrow_array_set_schema(array, storage) convert_array(array, to, ...) } #' @export as_nanoarrow_array_extension.default <- function(extension_spec, x, ..., schema = NULL) { stop( sprintf( "as_nanoarrow_array_extension() not implemented for extension %s", nanoarrow_schema_formatted(schema) ) ) } #' Create Arrow extension arrays #' #' @param storage_array A [nanoarrow_array][as_nanoarrow_array]. #' @inheritParams na_type #' #' @return A [nanoarrow_array][as_nanoarrow_array] with attached extension #' schema. 
#' @export #' #' @examples #' nanoarrow_extension_array(1:10, "some_ext", '{"key": "value"}') #' nanoarrow_extension_array <- function(storage_array, extension_name, extension_metadata = NULL) { storage_array <- as_nanoarrow_array(storage_array) schema <- .Call(nanoarrow_c_infer_schema_array, storage_array) schema$metadata[["ARROW:extension:name"]] <- extension_name schema$metadata[["ARROW:extension:metadata"]] <- extension_metadata shallow_copy <- array_shallow_copy(storage_array) nanoarrow_array_set_schema(shallow_copy, schema) shallow_copy } warn_unregistered_extension_type <- function(x) { # Warn that we're about to ignore an extension type if (!is.null(x$name) && !identical(x$name, "")) { warning( sprintf( "%s: Converting unknown extension %s as storage type", x$name, nanoarrow_schema_formatted(x) ) ) } else { warning( sprintf( "Converting unknown extension %s as storage type", nanoarrow_schema_formatted(x) ) ) } } # Mutable registry to look up extension specifications extension_registry <- new.env(parent = emptyenv()) nanoarrow/R/util.R0000644000176200001440000001034614502402562013565 0ustar liggesusers# Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
arrow_installed <- function() { opt <- Sys.getenv( "R_NANOARROW_WITHOUT_ARROW", getOption("nanoarrow.without_arrow", FALSE) ) if (identical(tolower(opt), "true")) { FALSE } else { requireNamespace("arrow", quietly = TRUE) } } assert_arrow_installed <- function(reason) { if (!arrow_installed()) { stop( sprintf("Package 'arrow' required for %s", reason), call. = FALSE ) } } warn_lossy_conversion <- function(count, msg) { cnd <- simpleWarning( sprintf("%d value(s) %s", count, msg), call = sys.call(-1) ) class(cnd) <- union("nanoarrow_warning_lossy_conversion", class(cnd)) warning(cnd) } # Internally we use R_PreserveObject() and R_ReleaseObject() to manage R objects # that must be kept alive for ArrowArray buffers to stay valid. This count # should be zero after tests have run in a fresh session and both gc() and # preserved_empty() have been run. If this isn't the case, compile with # -DNANOARROW_DEBUG_PRESERVE and run preserved_empty() to get verbose output # about which objects didn't get released (including an R traceback to where # they were preserved). preserved_count <- function() { .Call(nanoarrow_c_preserved_count) } # Most objects are both preserved and released on the R main thread; however # when sending objects into the wild there is no guarantee that they will be # deleted on the R main thread (even though they usually are). The R package # handles this by keeping a list of objects that couldn't be released: calling # this function will release them and return how many were released. preserved_empty <- function() { .Call(nanoarrow_c_preserved_empty) } # To test the "release from another thread" mechanism, this preserves obj, # releases it from another thread and returns. preserve_and_release_on_other_thread <- function(obj) { invisible(.Call(nanoarrow_c_preserve_and_release_on_other_thread, obj)) } # This is used by bookkeeping infrastructure when debugging an imbalance in # preserved/released SEXPs. 
current_stack_trace_chr <- function() {
  # Capture the trace first so that the printing frames are not part of it
  tb <- rlang::trace_back()
  printed <- utils::capture.output(print(tb))
  paste0(printed, collapse = "\n")
}

# Consolidate places we should call vctrs::vec_slice()
# if/when a vctrs dependency is added
vec_slice2 <- function(x, i) {
  if (!is.data.frame(x)) {
    return(x[i])
  }

  # Row-wise slice that always stays a data frame
  x[i, , drop = FALSE]
}

# Null-coalescing operator: the first operand unless it is NULL
`%||%` <- function(rhs, lhs) {
  if (!is.null(rhs)) rhs else lhs
}

# Low-overhead data.frame constructor from a list of columns and a row count
new_data_frame <- function(x, nrow) {
  compact_row_names <- c(NA, nrow)
  structure(x, row.names = compact_row_names, class = "data.frame")
}

# Generate a random vector of length `n` shaped like `ptype` (logical,
# integer, numeric, character, or a data.frame of those). `prop_na` is the
# approximate proportion of NA values injected into non-data-frame output.
vec_gen <- function(ptype, n = 1e3, prop_true = 0.5, prop_na = 0,
                    chr_len = function(n) ceiling(25 * stats::runif(n))) {
  vec <- switch(
    class(ptype)[1],
    logical = stats::runif(n) < prop_true,
    integer = as.integer(stats::runif(n, min = -1, max = 1) * .Machine$integer.max),
    numeric = stats::runif(n),
    character = strrep(rep_len(letters, n), chr_len(n)),
    data.frame = new_data_frame(
      lapply(
        ptype,
        function(col_ptype) {
          vec_gen(
            col_ptype,
            n = n,
            prop_true = prop_true,
            prop_na = prop_na,
            chr_len = chr_len
          )
        }
      ),
      n
    ),
    stop(sprintf("Don't know how to generate vector for type %s", class(ptype)[1]))
  )

  # Data frame columns already had NAs injected by the recursive calls
  if (is.data.frame(vec) || !(prop_na > 0)) {
    return(vec)
  }

  is_na <- stats::runif(n) < prop_na
  vec[is_na] <- ptype[NA_integer_]
  vec
}

# Randomly permute the elements (or rows, for a data frame) of `x`
vec_shuffle <- function(x) {
  positions <- if (is.data.frame(x)) seq_len(nrow(x)) else seq_along(x)
  i <- sample(positions, replace = FALSE)
  vec_slice2(x, i)
}
nanoarrow/R/infer-ptype.R0000644000176200001440000001254714502402562015057 0ustar liggesusers
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

#' Infer an R vector prototype
#'
#' Resolves the default `to` value to use in [convert_array()] and
#' [convert_array_stream()]. The default conversions are:
#'
#' - null to [vctrs::unspecified()]
#' - boolean to [logical()]
#' - int8, uint8, int16, uint16, and int32 to [integer()]
#' - uint32, int64, uint64, float, and double to [double()]
#' - string and large string to [character()]
#' - struct to [data.frame()]
#' - binary and large binary to [blob::blob()]
#' - list, large_list, and fixed_size_list to [vctrs::list_of()]
#' - time32 and time64 to [hms::hms()]
#' - duration to [difftime()]
#' - date32 to [as.Date()]
#' - timestamp to [as.POSIXct()]
#'
#' Additional conversions are possible by specifying an explicit value for
#' `to`. For details of each conversion, see [convert_array()].
#'
#' @param x A [nanoarrow_schema][as_nanoarrow_schema],
#' [nanoarrow_array][as_nanoarrow_array], or
#' [nanoarrow_array_stream][as_nanoarrow_array_stream].
#'
#' @return An R vector of zero size describing the target into which
#' the array should be materialized.
#' @export
#'
#' @examples
#' infer_nanoarrow_ptype(as_nanoarrow_array(1:10))
#'
infer_nanoarrow_ptype <- function(x) {
  # Normalize every accepted input to a schema before inferring
  schema <- if (inherits(x, "nanoarrow_array")) {
    .Call(nanoarrow_c_infer_schema_array, x)
  } else if (inherits(x, "nanoarrow_array_stream")) {
    .Call(nanoarrow_c_array_stream_get_schema, x)
  } else if (inherits(x, "nanoarrow_schema")) {
    x
  } else {
    stop("`x` must be a nanoarrow_schema(), nanoarrow_array(), or nanoarrow_array_stream()")
  }

  .Call(nanoarrow_c_infer_ptype, schema)
}

# This is called from C from nanoarrow_c_infer_ptype when all the C conversions
# have been tried. Some of these inferences could be moved to C to be faster
# (but are much less verbose to create here)
infer_ptype_other <- function(schema) {
  # We don't need the user-friendly versions and this is performance-sensitive
  info <- .Call(nanoarrow_c_schema_parse, schema)

  # Give registered extension types a chance to resolve the ptype
  if (!is.null(info$extension_name)) {
    spec <- resolve_nanoarrow_extension(info$extension_name)
    return(infer_nanoarrow_ptype_extension(spec, schema))
  }

  switch(
    info$type,
    "na" = vctrs::unspecified(),
    "binary" = ,
    "large_binary" = new_blob_internal(),
    "date32" = structure(numeric(), class = "Date"),
    "time32" = ,
    "time64" = hms::hms(),
    "duration" = structure(numeric(), class = "difftime", units = "secs"),
    "date64" = ,
    "timestamp" = {
      tz <- info$timezone
      if (is.null(tz) || tz == "") {
        # We almost never want to assume the user's timezone here, which is
        # what would happen if we passed on "". This is consistent with how
        # readr handles reading timezones (assign "UTC" since it's DST-free
        # and let the user explicitly set this later)
        tz <- getOption("nanoarrow.timezone_if_unspecified", "UTC")
      }

      structure(
        numeric(0),
        class = c("POSIXct", "POSIXt"),
        tzone = tz
      )
    },
    "map" = ,
    "large_list" = ,
    "list" = ,
    "fixed_size_list" = {
      child_ptype <- infer_nanoarrow_ptype(schema$children[[1]])
      vctrs::list_of(.ptype = child_ptype)
    },
    "dictionary" = {
      # Even though R's 'factor' can handle a dictionary of strings
      # (perhaps the most common case), an array arriving in chunks may have
      # different dictionary arrays. Thus, the best type-stable default we can
      # achieve is to expand dictionaries.
      infer_nanoarrow_ptype(schema$dictionary)
    },
    stop_cant_infer_ptype(schema, n = -1)
  )
}

# Raise the "can't infer" error for `schema`, attributing it to the call
# `n - 1` frames relative to this function. The field name is included in the
# message when it is non-empty.
stop_cant_infer_ptype <- function(schema, n = 0) {
  schema_label <- nanoarrow_schema_formatted(schema)
  unnamed <- is.null(schema$name) || identical(schema$name, "")

  msg <- if (unnamed) {
    sprintf(
      "Can't infer R vector type for <%s>",
      schema_label
    )
  } else {
    sprintf(
      "Can't infer R vector type for `%s` <%s>",
      schema$name,
      schema_label
    )
  }

  cnd <- simpleError(msg, call = sys.call(n - 1))
  stop(cnd)
}

# Try to load the blob namespace. If it fails, we still return the correct
# ptype object. This is not ideal because the behaviour of the output object
# may be slightly different if blob isn't installed; however, we use this
# conversion for printing buffers and it's difficult to work around with the
# current system for conversion.
new_blob_internal <- function() {
  requireNamespace("blob", quietly = TRUE)

  blob_classes <- c("blob", "vctrs_list_of", "vctrs_vctr", "list")
  structure(
    list(),
    ptype = raw(0),
    class = blob_classes
  )
}
nanoarrow/R/buffer.R0000644000176200001440000001632014547575511014075 0ustar liggesusers
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.
# See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

#' Convert an object to a nanoarrow buffer
#'
#' @param x An object to convert to a buffer
#' @param ... Passed to S3 methods
#'
#' @return An object of class 'nanoarrow_buffer'
#' @export
#'
#' @examples
#' array <- as_nanoarrow_array(c(NA, 1:4))
#' array$buffers
#' as.raw(array$buffers[[1]])
#' as.raw(array$buffers[[2]])
#' convert_buffer(array$buffers[[1]])
#' convert_buffer(array$buffers[[2]])
#'
as_nanoarrow_buffer <- function(x, ...) {
  UseMethod("as_nanoarrow_buffer")
}

#' @export
as_nanoarrow_buffer.nanoarrow_buffer <- function(x, ...) {
  x
}

#' @export
as_nanoarrow_buffer.default <- function(x, ...) {
  # Capture the C-level error message (if any) so it can be appended to a
  # message that also names the class of `x`
  msg <- NULL
  result <- tryCatch(
    .Call(nanoarrow_c_as_buffer_default, x),
    error = function(e) {
      msg <<- conditionMessage(e)
      NULL
    }
  )

  if (is.null(result) && is.null(msg)) {
    cls <- paste(class(x), collapse = "/")
    stop(sprintf("Can't convert object of type %s to nanoarrow_buffer", cls))
  } else if (is.null(result)) {
    cls <- paste(class(x), collapse = "/")
    stop(sprintf("Can't convert object of type %s to nanoarrow_buffer: %s", cls, msg))
  }

  result
}

#' @importFrom utils str
#' @export
str.nanoarrow_buffer <- function(object, ..., db = FALSE, indent.str = "",
                                 width = getOption("width")) {
  formatted <- format(object)
  cat(formatted)

  info <- nanoarrow_buffer_info(object)
  if (info$data_type == "unknown") {
    cat("\n")
    return(invisible(object))
  }

  # Worst case is decimal256, which might occupy 2 output characters per
  # 256 buffer bytes per element. The actual number isn't too important here,
  # it's just important to make sure this won't try to format gigabytes of
  # text.
  width <- width - nchar(indent.str) - nchar(formatted) - 3
  max_print_bytes <- width / 2 * 256
  buffer_print <- nanoarrow_buffer_head_bytes(object, max_print_bytes)

  # Printing the values is best-effort: a failed conversion is not an error
  try({
    array <- as_nanoarrow_array.nanoarrow_buffer(buffer_print)
    vector <- convert_array(array)

    # binary output here is just '[blob xxb]` which is not all that useful here
    if (inherits(vector, "blob")) {
      vector <- vector[[1]]
    }

    formatted_data <- paste(
      format(vector, trim = TRUE),
      collapse = " "
    )

    cat(" `")
    if (nchar(formatted_data) > width) {
      cat(substr(formatted_data, 1, width - 3))
      cat("...")
    } else {
      cat(formatted_data)
    }
  }, silent = TRUE)

  cat("`\n")
  invisible(object)
}

#' @export
print.nanoarrow_buffer <- function(x, ...) {
  str(x, ...)
  invisible(x)
}

#' @export
format.nanoarrow_buffer <- function(x, ...) {
  info <- nanoarrow_buffer_info(x)
  if (info$data_type == "unknown") {
    len <- ""
  } else if (info$element_size_bits == 0 || info$data_type %in% c("binary", "string")) {
    len <- sprintf("[%s b]", info$size_bytes)
  } else {
    logical_length <- (info$size_bytes * 8) %/% info$element_size_bits
    len <- sprintf("[%s][%s b]", logical_length, info$size_bytes)
  }

  sprintf(
    "<%s %s<%s>%s>",
    class(x)[1],
    info$type,
    info$data_type,
    len
  )
}

#' Create and modify nanoarrow buffers
#'
#' @param buffer,new_buffer [nanoarrow_buffer][as_nanoarrow_buffer]s.
#' @inheritParams convert_array
#'
#' @return
#' - `nanoarrow_buffer_init()`: An object of class 'nanoarrow_buffer'
#' - `nanoarrow_buffer_append()`: Returns `buffer`, invisibly. Note that
#' `buffer` is modified in place by reference.
#' @export
#'
#' @examples
#' buffer <- nanoarrow_buffer_init()
#' nanoarrow_buffer_append(buffer, 1:5)
#'
#' array <- nanoarrow_array_modify(
#' nanoarrow_array_init(na_int32()),
#' list(length = 5, buffers = list(NULL, buffer))
#' )
#' as.vector(array)
#'
nanoarrow_buffer_init <- function() {
  as_nanoarrow_buffer(NULL)
}

#' @rdname nanoarrow_buffer_init
#' @export
nanoarrow_buffer_append <- function(buffer, new_buffer) {
  buffer <- as_nanoarrow_buffer(buffer)
  new_buffer <- as_nanoarrow_buffer(new_buffer)

  .Call(nanoarrow_c_buffer_append, buffer, new_buffer)

  invisible(buffer)
}

#' @rdname nanoarrow_buffer_init
#' @export
convert_buffer <- function(buffer, to = NULL) {
  convert_array(as_nanoarrow_array.nanoarrow_buffer(buffer), to = to)
}

#' @export
as_nanoarrow_array.nanoarrow_buffer <- function(x, ..., schema = NULL) {
  if (!is.null(schema)) {
    stop("as_nanoarrow_array() with non-NULL schema is not supported")
  }

  info <- nanoarrow_buffer_info(x)
  # String/binary data buffers are treated as a sequence of bytes
  if (info$data_type %in% c("binary", "string")) {
    info$element_size_bits <- 8L
  }

  if (info$data_type == "unknown" || info$element_size_bits == 0) {
    stop("Can't convert buffer with unknown type or unknown element size")
  }

  data_type <- info$data_type
  logical_length <- (info$size_bytes * 8) %/% info$element_size_bits

  if (data_type %in% c("string", "binary") && logical_length <= .Machine$integer.max) {
    # A single string/binary element spanning the whole buffer (32-bit offsets)
    array <- nanoarrow_array_init(na_type(data_type))
    offsets <- as.integer(c(0, logical_length))
    nanoarrow_array_modify(
      array,
      list(
        length = 1,
        null_count = 0,
        buffers = list(NULL, offsets, x)
      )
    )
  } else if (data_type %in% c("string", "binary")) {
    # Too long for 32-bit offsets: use the large_ variant with int64 offsets
    array <- nanoarrow_array_init(na_type(paste0("large_", data_type)))
    offsets <- as_nanoarrow_array(c(0, logical_length), schema = na_int64())$buffers[[2]]
    nanoarrow_array_modify(
      array,
      list(
        length = 1,
        null_count = 0,
        buffers = list(NULL, offsets, x)
      )
    )
  } else {
    array <- nanoarrow_array_init(na_type(data_type))
    nanoarrow_array_modify(
      array,
      list(
        length = logical_length,
        null_count = 0,
        buffers = list(NULL, x)
      )
    )
  }
}

#' @export
as.raw.nanoarrow_buffer <- function(x, ...) {
  .Call(nanoarrow_c_buffer_as_raw, x)
}

#' @export
as.vector.nanoarrow_buffer <- function(x, mode) {
  convert_buffer(x)
}

nanoarrow_buffer_info <- function(x) {
  .Call(nanoarrow_c_buffer_info, x)
}

nanoarrow_buffer_head_bytes <- function(x, max_bytes) {
  .Call(nanoarrow_c_buffer_head_bytes, x, as.double(max_bytes)[1])
}

# This is the list()-like interface to nanoarrow_buffer that allows $ and [[
# to make nice auto-complete when interacting in an IDE

# The length must agree with the number of fields reported by
# names.nanoarrow_buffer() below.
#' @export
length.nanoarrow_buffer <- function(x, ...) {
  6L
}

#' @export
names.nanoarrow_buffer <- function(x, ...) {
  c("data", "size_bytes", "capacity_bytes", "type", "data_type", "element_size_bits")
}

#' @export
`[[.nanoarrow_buffer` <- function(x, i, ...) {
  nanoarrow_buffer_info(x)[[i]]
}

#' @export
`$.nanoarrow_buffer` <- function(x, i, ...) {
  x[[i]]
}
nanoarrow/R/array.R0000644000176200001440000002655314502402562013735 0ustar liggesusers
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.
# The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

#' Convert an object to a nanoarrow array
#'
#' In nanoarrow an 'array' refers to the `struct ArrowArray` definition
#' in the Arrow C data interface. At the R level, we attach a
#' [schema][as_nanoarrow_schema] such that functionally the nanoarrow_array
#' class can be used in a similar way as an `arrow::Array`. Note that in
#' nanoarrow an `arrow::RecordBatch` and a non-nullable `arrow::StructArray`
#' are represented identically.
#'
#' @param x An object to convert to an array
#' @param schema An optional schema used to enforce conversion to a particular
#' type. Defaults to [infer_nanoarrow_schema()].
#' @param ... Passed to S3 methods
#'
#' @return An object of class 'nanoarrow_array'
#' @export
#'
#' @examples
#' (array <- as_nanoarrow_array(1:5))
#' as.vector(array)
#'
#' (array <- as_nanoarrow_array(data.frame(x = 1:5)))
#' as.data.frame(array)
#'
as_nanoarrow_array <- function(x, ..., schema = NULL) {
  UseMethod("as_nanoarrow_array")
}

# See as-array.R for S3 method implementations

#' @export
as.vector.nanoarrow_array <- function(x, mode = "any") {
  stopifnot(identical(mode, "any"))
  convert_array(x, to = infer_nanoarrow_ptype(x))
}

#' @export
as.data.frame.nanoarrow_array <- function(x, ...) {
  schema <- infer_nanoarrow_schema(x)
  # Only struct arrays (format "+s") can become a data frame
  if (schema$format != "+s") {
    stop(
      sprintf(
        "Can't convert array with type %s to data.frame()",
        nanoarrow_schema_formatted(schema)
      )
    )
  }

  .Call(nanoarrow_c_convert_array, x, NULL)
}

# exported in zzz.R
as_tibble.nanoarrow_array <- function(x, ...) {
  tibble::as_tibble(as.data.frame.nanoarrow_array(x), ...)
}

#' @export
infer_nanoarrow_schema.nanoarrow_array <- function(x, ...) {
  .Call(nanoarrow_c_infer_schema_array, x) %||%
    stop("nanoarrow_array() has no associated schema")
}

#' @importFrom utils str
#' @export
str.nanoarrow_array <- function(object, ...) {
  cat(sprintf("%s\n", format(object, .recursive = FALSE)))

  if (nanoarrow_pointer_is_valid(object)) {
    # Use the str() of the list version but remove the first
    # line of the output ("List of 6")
    info <- nanoarrow_array_proxy_safe(object)
    raw_str_output <- utils::capture.output(str(info, ...))
    cat(paste0(raw_str_output[-1], collapse = "\n"))
    cat("\n")
  }

  invisible(object)
}

#' @export
print.nanoarrow_array <- function(x, ...) {
  str(x, ...)
  invisible(x)
}

# One-line description of the array: the formatted schema (when one is
# attached) followed by the array length. Every label is wrapped in angle
# brackets, e.g. <nanoarrow_array int32[5]>.
#' @export
format.nanoarrow_array <- function(x, ..., .recursive = TRUE) {
  if (nanoarrow_pointer_is_valid(x)) {
    schema <- .Call(nanoarrow_c_infer_schema_array, x)
    if (is.null(schema)) {
      sprintf("<nanoarrow_array <unknown schema>[%s]>", x$length)
    } else {
      sprintf(
        "<nanoarrow_array %s[%s]>",
        nanoarrow_schema_formatted(schema, .recursive),
        x$length
      )
    }
  } else {
    "<nanoarrow_array[invalid pointer]>"
  }
}

# This is the list()-like interface to nanoarrow_array that allows $ and [[
# to make nice auto-complete for the array fields

#' @export
length.nanoarrow_array <- function(x, ...) {
  6L
}

#' @export
names.nanoarrow_array <- function(x, ...) {
  c("length", "null_count", "offset", "buffers", "children", "dictionary")
}

#' @export
`[[.nanoarrow_array` <- function(x, i, ...) {
  nanoarrow_array_proxy_safe(x)[[i]]
}

#' @export
`$.nanoarrow_array` <- function(x, i, ...) {
  nanoarrow_array_proxy_safe(x)[[i]]
}

#' @export
`[[<-.nanoarrow_array` <- function(x, i, value) {
  # Translate a positional index into the corresponding field name
  if (is.numeric(i) && isTRUE(i %in% 1:6)) {
    i <- names.nanoarrow_array()[[i]]
  }

  if (is.character(i) && (length(i) == 1L) && !is.na(i)) {
    new_values <- list(value)
    names(new_values) <- i
    return(nanoarrow_array_modify(x, new_values))
  }

  stop("`i` must be character(1) or integer(1) %in% 1:6")
}

#' @export
`$<-.nanoarrow_array` <- function(x, i, value) {
  new_values <- list(value)
  names(new_values) <- i
  nanoarrow_array_modify(x, new_values)
}

# A version of nanoarrow_array_proxy() that is less likely to error for invalid
# arrays and/or schemas
nanoarrow_array_proxy_safe <- function(array, recursive = FALSE) {
  schema <- .Call(nanoarrow_c_infer_schema_array, array)
  tryCatch(
    nanoarrow_array_proxy(array, schema = schema, recursive = recursive),
    error = function(...) nanoarrow_array_proxy(array, recursive = recursive)
  )
}

nanoarrow_array_proxy <- function(array, schema = NULL, recursive = FALSE) {
  if (!is.null(schema)) {
    array_view <- .Call(nanoarrow_c_array_view, array, schema)
    result <- .Call(nanoarrow_c_array_proxy, array, array_view, recursive)

    names(result$children) <- names(schema$children)

    if (!recursive) {
      # Pass on some information from the schema if we have it
      result$children <- Map(
        nanoarrow_array_set_schema,
        result$children,
        schema$children
      )

      if (!is.null(result$dictionary)) {
        nanoarrow_array_set_schema(result$dictionary, schema$dictionary)
      }
    }
  } else {
    result <- .Call(nanoarrow_c_array_proxy, array, NULL, recursive)
  }

  result
}

#' Modify nanoarrow arrays
#'
#' Create a new array or from an existing array, modify one or more parameters.
#' When importing an array from elsewhere, `nanoarrow_array_set_schema()` is
#' useful to attach the data type information to the array (without this
#' information there is little that nanoarrow can do with the array since its
#' content cannot be otherwise interpreted). `nanoarrow_array_modify()` can
#' create a shallow copy and modify various parameters to create a new array,
#' including setting children and buffers recursively. These functions power the
#' `$<-` operator, which can modify one parameter at a time.
#'
#' @param array A [nanoarrow_array][as_nanoarrow_array].
#' @param schema A [nanoarrow_schema][as_nanoarrow_schema] to attach to this
#' `array`.
#' @param new_values A named `list()` of values to replace.
#' @param validate Use `FALSE` to skip validation. Skipping validation may
#' result in creating an array that will crash R.
#'
#' @return
#' - `nanoarrow_array_init()` returns a possibly invalid but initialized
#' array with a given `schema`.
#' - `nanoarrow_array_set_schema()` returns `array`, invisibly. Note that
#' `array` is modified in place by reference.
#' - `nanoarrow_array_modify()` returns a shallow copy of `array` with the
#' modified parameters such that the original array remains valid.
#' @export
#'
#' @examples
#' nanoarrow_array_init(na_string())
#'
#' # Modify an array using $ and <-
#' array <- as_nanoarrow_array(1:5)
#' array$length <- 4
#' as.vector(array)
#'
#' # Modify potentially more than one component at a time
#' array <- as_nanoarrow_array(1:5)
#' as.vector(nanoarrow_array_modify(array, list(length = 4)))
#'
#' # Attach a schema to an array
#' array <- as_nanoarrow_array(-1L)
#' nanoarrow_array_set_schema(array, na_uint32())
#' as.vector(array)
#'
nanoarrow_array_init <- function(schema) {
  .Call(nanoarrow_c_array_init, schema)
}

#' @rdname nanoarrow_array_init
#' @export
nanoarrow_array_set_schema <- function(array, schema, validate = TRUE) {
  .Call(nanoarrow_c_array_set_schema, array, schema, as.logical(validate)[1])
  invisible(array)
}

#' @rdname nanoarrow_array_init
#' @export
nanoarrow_array_modify <- function(array, new_values, validate = TRUE) {
  array <- as_nanoarrow_array(array)

  if (length(new_values) == 0) {
    return(array)
  }

  # Make sure new_values has names to iterate over
  new_names <- names(new_values)
  if (is.null(new_names) || all(new_names == "", na.rm = TRUE)) {
    stop("`new_values` must be named")
  }

  # Make a copy and modify it. This is a deep copy in the sense that all
  # children are modifiable; however, it's a shallow copy in the sense that
  # none of the buffers are copied.
  schema <- .Call(nanoarrow_c_infer_schema_array, array)
  array_copy <- array_shallow_copy(array, schema, validate = validate)

  for (i in seq_along(new_values)) {
    nm <- new_names[i]
    value <- new_values[[i]]

    switch(
      nm,
      length = .Call(nanoarrow_c_array_set_length, array_copy, as.double(value)),
      null_count = .Call(nanoarrow_c_array_set_null_count, array_copy, as.double(value)),
      offset = .Call(nanoarrow_c_array_set_offset, array_copy, as.double(value)),
      buffers = {
        value <- lapply(value, as_nanoarrow_buffer)
        .Call(nanoarrow_c_array_set_buffers, array_copy, value)
      },
      children = {
        value <- lapply(value, as_nanoarrow_array)
        value_copy <- lapply(value, array_shallow_copy, validate = validate)
        .Call(nanoarrow_c_array_set_children, array_copy, value_copy)

        # Keep the schema's children in sync with the new child arrays
        if (!is.null(schema)) {
          schema <- nanoarrow_schema_modify(
            schema,
            list(children = lapply(value, infer_nanoarrow_schema)),
            validate = validate
          )
        }
      },
      dictionary = {
        if (!is.null(value)) {
          value <- as_nanoarrow_array(value)
          value_copy <- array_shallow_copy(value, validate = validate)
        } else {
          value_copy <- NULL
        }

        .Call(nanoarrow_c_array_set_dictionary, array_copy, value_copy)

        # Keep the schema's dictionary in sync (or drop it) as well
        if (!is.null(schema) && !is.null(value)) {
          schema <- nanoarrow_schema_modify(
            schema,
            list(dictionary = infer_nanoarrow_schema(value)),
            validate = validate
          )
        } else if (!is.null(schema)) {
          schema <- nanoarrow_schema_modify(
            schema,
            list(dictionary = NULL),
            validate = validate
          )
        }
      },
      stop(sprintf("Can't modify array[[%s]]: does not exist", deparse(nm)))
    )
  }

  if (!is.null(schema) && validate) {
    array_copy <- .Call(nanoarrow_c_array_validate_after_modify, array_copy, schema)
  }

  if (!is.null(schema)) {
    nanoarrow_array_set_schema(array_copy, schema, validate = validate)
  }

  array_copy
}

array_shallow_copy <- function(array, schema = NULL, validate = TRUE) {
  array_copy <- nanoarrow_allocate_array()
  nanoarrow_pointer_export(array, array_copy)
  schema <- schema %||% .Call(nanoarrow_c_infer_schema_array, array)

  # For validation, use some of the infrastructure we already have in place
  # to make sure array_copy knows how long each buffer is
  if (!is.null(schema) && validate) {
    copy_buffers_recursive(array, array_copy)
  }

  array_copy
}

copy_buffers_recursive <- function(array, array_copy) {
  proxy <- nanoarrow_array_proxy_safe(array)
  proxy_copy <- nanoarrow_array_proxy(array_copy)

  .Call(nanoarrow_c_array_set_buffers, array_copy, proxy$buffers)

  for (i in seq_along(proxy$children)) {
    copy_buffers_recursive(proxy$children[[i]], proxy_copy$children[[i]])
  }

  if (!is.null(proxy$dictionary)) {
    copy_buffers_recursive(proxy$dictionary, proxy_copy$dictionary)
  }
}
nanoarrow/R/nanoarrow-package.R0000644000176200001440000000257214355103326016213 0ustar liggesusers
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#' @keywords internal
"_PACKAGE"

## usethis namespace: start
#' @importFrom utils getFromNamespace
#' @useDynLib nanoarrow, .registration = TRUE
## usethis namespace: end
NULL

#' Underlying 'nanoarrow' C library build
#'
#' @param runtime Compare TRUE and FALSE values to detect a
#' possible ABI mismatch.
#'
#' @return A string identifying the version of nanoarrow this package
#' was compiled against.
#' @export
#'
#' @examples
#' nanoarrow_version()
#'
nanoarrow_version <- function(runtime = TRUE) {
  # Runtime version on request; otherwise the compile-time version
  if (runtime) {
    return(.Call(nanoarrow_c_version_runtime))
  }

  .Call(nanoarrow_c_version)
}
nanoarrow/R/convert-array.R0000644000176200001440000002154414502402562015406 0ustar liggesusers
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

#' Convert an Array into an R vector
#'
#' Converts `array` to the type specified by `to`. This is a low-level interface;
#' most users should use `as.data.frame()` or `as.vector()` unless finer-grained
#' control is needed over the conversion. This function is an S3 generic
#' dispatching on `to`: developers may implement their own S3 methods for
#' custom vector types.
#'
#' @param array A [nanoarrow_array][as_nanoarrow_array].
#' @param to A target prototype object describing the type to which `array`
#' should be converted, or `NULL` to use the default conversion as
#' returned by [infer_nanoarrow_ptype()]. Alternatively, a function can be
#' passed to perform an alternative calculation of the default ptype as
#' a function of `array` and the default inference of the prototype.
#' @param ... Passed to S3 methods
#'
#' @return An R vector of type `to`.
#' @export
#'
#' @details
#' Conversions are implemented for the following R vector types:
#'
#' - [logical()]: Any numeric type can be converted to [logical()] in addition
#' to the bool type. For numeric types, any non-zero value is considered `TRUE`.
#' - [integer()]: Any numeric type can be converted to [integer()]; however,
#' a warning will be signaled if any value is outside the range of the
#' 32-bit integer.
#' - [double()]: Any numeric type can be converted to [double()]. This
#' conversion currently does not warn for values that may not roundtrip
#' through a floating-point double (e.g., very large uint64 and int64 values).
#' - [character()]: String and large string types can be converted to
#' [character()]. The conversion does not check for valid UTF-8: if you need
#' finer-grained control over encodings, use `to = blob::blob()`.
#' - [factor()]: Dictionary-encoded arrays of strings can be converted to
#' `factor()`; however, this must be specified explicitly (i.e.,
#' `convert_array(array, factor())`) because arrays arriving
#' in chunks can have dictionaries that contain different levels. Use
#' `convert_array(array, factor(levels = c(...)))` to materialize an array
#' into a vector with known levels.
#' - [Date][as.Date()]: Only the date32 type can be converted to an R Date vector.
#' - [hms::hms()]: Time32 and time64 types can be converted to [hms::hms()].
#' - [difftime()]: Time32, time64, and duration types can be converted to
#' R [difftime()] vectors. The value is converted to match the [units()]
#' attribute of `to`.
#' - [blob::blob()]: String, large string, binary, and large binary types can
#' be converted to [blob::blob()].
#' - [vctrs::list_of()]: List, large list, and fixed-size list types can be
#' converted to [vctrs::list_of()].
#' - [data.frame()]: Struct types can be converted to [data.frame()].
#' - [vctrs::unspecified()]: Any type can be converted to [vctrs::unspecified()];
#' however, a warning will be raised if any non-null values are encountered.
#'
#' In addition to the above conversions, a null array may be converted to any
#' target prototype except [data.frame()]. Extension arrays are currently
#' converted as their storage type.
#'
#' @examples
#' array <- as_nanoarrow_array(data.frame(x = 1:5))
#' str(convert_array(array))
#' str(convert_array(array, to = data.frame(x = double())))
#'
convert_array <- function(array, to = NULL, ...) {
  stopifnot(inherits(array, "nanoarrow_array"))
  UseMethod("convert_array", to)
}

#' @export
convert_array.default <- function(array, to = NULL, ..., .from_c = FALSE) {
  # When `.from_c` is TRUE the C conversions have already been tried, so only
  # the R-level fallbacks (extensions, dictionaries) are attempted here.
  if (.from_c) {
    # Handle extension conversion
    # We don't need the user-friendly versions and this is performance-sensitive
    schema <- .Call(nanoarrow_c_infer_schema_array, array)
    schema_info <- .Call(nanoarrow_c_schema_parse, schema)
    if (!is.null(schema_info$extension_name)) {
      ext_spec <- resolve_nanoarrow_extension(schema_info$extension_name)
      return(convert_array_extension(ext_spec, array, to, ...))
    }

    # Handle default dictionary conversion since it's the same for all types
    dict_array <- array$dictionary
    if (!is.null(dict_array)) {
      dict_values <- .Call(nanoarrow_c_convert_array, dict_array, to)

      # Convert the zero-based indices, then look them up in the values
      array$dictionary <- NULL
      zero_based <- .Call(nanoarrow_c_convert_array, array, integer())
      return(vec_slice2(dict_values, zero_based + 1L))
    }

    stop_cant_convert_array(array, to)
  }

  # A function `to` computes the ptype from the array and its default ptype
  if (is.function(to)) {
    to <- to(array, infer_nanoarrow_ptype(array))
  }

  .Call(nanoarrow_c_convert_array, array, to)
}

# This is defined because it's verbose to pass named
# arguments from C.
# When converting data frame columns, we try the internal C conversions
# first to save R evaluation overhead. When the internal conversions fail,
# we call convert_array() to dispatch to conversions defined via S3
# dispatch, making sure to let the default method know that we've already
# tried the internal C conversions.
convert_fallback_other <- function(array, offset, length, to) {
  # If we need to modify offset/length, do it using a shallow copy.
  if (!is.null(offset)) {
    array <- nanoarrow_array_modify(
      array,
      list(offset = offset, length = length),
      validate = FALSE
    )
  }

  # Call convert_array() on a single chunk. Use .from_c = TRUE to ensure that
  # methods do not attempt to pass the same array back to the C conversions.
  # When the result is passed back to C it is checked enough to avoid segfault
  # but not necessarily for correctness (e.g., factors with levels that don't
  # correspond to 'to'). This result may be used as-is or may be copied into
  # a slice of another vector.
  convert_array(array, to, .from_c = TRUE)
}

#' @export
convert_array.double <- function(array, to, ...) {
  # Handle conversion from decimal128 via arrow
  schema <- infer_nanoarrow_schema(array)
  parsed <- nanoarrow_schema_parse(schema)
  if (parsed$type == "decimal128") {
    assert_arrow_installed(
      sprintf(
        "convert %s array to object of type double",
        nanoarrow_schema_formatted(schema)
      )
    )

    arrow_array <- as_arrow_array.nanoarrow_array(array)
    arrow_array$as_vector()
  } else {
    NextMethod()
  }
}

#' @export
convert_array.vctrs_partial_frame <- function(array, to, ...) {
  ptype <- infer_nanoarrow_ptype(array)
  if (!is.data.frame(ptype)) {
    stop_cant_convert_array(array, to)
  }

  ptype <- vctrs::vec_ptype_common(ptype, to)
  .Call(nanoarrow_c_convert_array, array, ptype)
}

# Convert a dictionary-encoded or plain string array to a factor. A `to` with
# no levels (i.e., `factor()`) is the sentinel for "take levels from the data".
#' @export
convert_array.factor <- function(array, to, ...) {
  if (!is.null(array$dictionary)) {
    dict_levels <- convert_array(array$dictionary, character())

    # Indices are zero-based in Arrow and one-based in R
    array$dictionary <- NULL
    indices <- convert_array(array, integer()) + 1L

    # Handle empty factor() as the sentinel for "auto levels"
    if (identical(levels(to), character())) {
      levels(to) <- dict_levels
    }

    if (identical(dict_levels, levels(to))) {
      fct_data <- indices
    } else if (all(dict_levels %in% levels(to))) {
      # Remap dictionary positions to positions in the requested levels
      level_map <- match(dict_levels, levels(to))
      fct_data <- level_map[indices]
    } else {
      stop("Error converting to factor: some levels in data do not exist in levels")
    }
  } else {
    strings <- convert_array(array, character())

    # Handle empty factor() as the sentinel for "auto levels"
    if (identical(levels(to), character())) {
      # No local `levels` exists in this branch, so the levels are left
      # unspecified and factor() derives them from the data.
      fct_data <- factor(strings)
      levels(to) <- levels(fct_data)
    } else {
      fct_data <- factor(strings, levels = levels(to))
    }
  }

  # Restore other attributes (e.g., ordered, labels)
  attributes(fct_data) <- attributes(to)
  fct_data
}

stop_cant_convert_array <- function(array, to, n = 0) {
  stop_cant_convert_schema(infer_nanoarrow_schema(array), to, n - 1)
}

# Raise the "can't convert" error, attributing it to the call `n - 1` frames
# relative to this function. The field name is included when it is non-empty.
stop_cant_convert_schema <- function(schema, to, n = 0) {
  schema_label <- nanoarrow_schema_formatted(schema)

  if (is.null(schema$name) || identical(schema$name, "")) {
    cnd <- simpleError(
      sprintf(
        "Can't convert array <%s> to R vector of type %s",
        schema_label,
        class(to)[1]
      ),
      call = sys.call(n - 1)
    )
  } else {
    cnd <- simpleError(
      sprintf(
        "Can't convert `%s` <%s> to R vector of type %s",
        schema$name,
        schema_label,
        class(to)[1]
      ),
      call = sys.call(n - 1)
    )
  }

  stop(cnd)
}
nanoarrow/R/convert-array-stream.R0000644000176200001440000001121214502402562016666 0ustar liggesusers
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.
The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. #' Convert an Array Stream into an R vector #' #' Converts `array_stream` to the type specified by `to`. This is a low-level #' interface; most users should use `as.data.frame()` or `as.vector()` unless #' finer-grained control is needed over the conversion. See [convert_array()] #' for details of the conversion process; see [infer_nanoarrow_ptype()] for #' default inferences of `to`. #' #' @param array_stream A [nanoarrow_array_stream][as_nanoarrow_array_stream]. #' @param size The exact size of the output, if known. If specified, #' a slightly more efficient implementation may be used to collect the output. #' @param n The maximum number of batches to pull from the array stream. #' @inheritParams convert_array #' @inheritParams basic_array_stream #' #' @return #' - `convert_array_stream()`: An R vector of type `to`. 
#' - `collect_array_stream()`: A `list()` of [nanoarrow_array][as_nanoarrow_array]
#' @export
#'
#' @examples
#' stream <- as_nanoarrow_array_stream(data.frame(x = 1:5))
#' str(convert_array_stream(stream))
#' str(convert_array_stream(stream, to = data.frame(x = double())))
#'
#' stream <- as_nanoarrow_array_stream(data.frame(x = 1:5))
#' collect_array_stream(stream)
#'
convert_array_stream <- function(array_stream, to = NULL, size = NULL, n = Inf) {
  stopifnot(
    inherits(array_stream, "nanoarrow_array_stream")
  )

  schema <- .Call(nanoarrow_c_array_stream_get_schema, array_stream)

  # Resolve the target prototype: a function is called with the schema and
  # the default inference; NULL means "use the default inference".
  if (is.function(to)) {
    to <- to(schema, infer_nanoarrow_ptype(schema))
  } else if (is.null(to)) {
    to <- infer_nanoarrow_ptype(schema)
  }

  n <- as.double(n)[1]

  # nanoarrow_c_convert_array_stream() currently needs the total length of
  # all batches up front. When the caller supplied it, convert directly.
  if (!is.null(size)) {
    return(
      .Call(
        nanoarrow_c_convert_array_stream,
        array_stream,
        to,
        as.double(size)[1],
        n
      )
    )
  }

  # Size unknown: pull the batches first so the total length can be computed.
  batches <- collect_array_stream(
    array_stream,
    n,
    schema = schema,
    validate = FALSE
  )

  # A lone batch goes through the single-array conversion, which currently
  # has a more efficient path for types that can be converted as ALTREP
  # (e.g., strings).
  if (length(batches) == 1L) {
    return(.Call(nanoarrow_c_convert_array, batches[[1]], to))
  }

  # Several (or zero) batches: wrap them in a fresh stream and convert with
  # the now-known size. .Call() is used directly because the inputs have
  # already been type checked.
  total_size <- .Call(nanoarrow_c_array_list_total_length, batches)
  collected_stream <- .Call(nanoarrow_c_basic_array_stream, batches, schema, FALSE)

  .Call(
    nanoarrow_c_convert_array_stream,
    collected_stream,
    to,
    as.double(total_size),
    Inf
  )
}

#' @rdname convert_array_stream
#' @export
collect_array_stream <- function(array_stream, n = Inf, schema = NULL,
                                 validate = TRUE) {
  stopifnot(
    inherits(array_stream, "nanoarrow_array_stream")
  )

  if (is.null(schema)) {
    schema <- .Call(nanoarrow_c_array_stream_get_schema, array_stream)
  }

  # Start with room for 1024 batches and trim to the number pulled at the end.
  collected <- vector("list", 1024L)
  n_collected <- 0
  pull_next <- array_stream$get_next

  while (n_collected < n) {
    batch <- pull_next(schema, validate = validate)
    if (is.null(batch)) {
      # NULL from get_next() ends collection
      break
    }

    # Assigning past the preallocated length has reasonable (but not great)
    # performance in recent versions of R because R overallocates vectors
    # slightly to support this pattern. It may be worth moving this
    # implementation to C or C++ in the future if the collect step becomes a
    # bottleneck.
    n_collected <- n_collected + 1
    collected[[n_collected]] <- batch
  }

  collected[seq_len(n_collected)]
}
nanoarrow/R/type.R0000644000176200001440000003227114502402562013572 0ustar liggesusers# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#' Create type objects #' #' In nanoarrow, types, fields, and schemas are all represented by a #' [nanoarrow_schema][as_nanoarrow_schema]. These functions are convenience #' constructors to create these objects in a readable way. Use [na_type()] to #' construct types based on the constructor name, which is also the name that #' prints/is returned by [nanoarrow_schema_parse()]. #' #' @param type_name The name of the type (e.g., "int32"). This form of the #' constructor is useful for writing tests that loop over many types. #' @param byte_width For [na_fixed_size_binary()], the number of bytes #' occupied by each item. #' @param list_size The number of elements in each item in a #' [na_fixed_size_list()]. #' @param precision The total number of digits representable by the decimal type #' @param scale The number of digits after the decimal point in a decimal type #' @param unit One of 's' (seconds), 'ms' (milliseconds), 'us' (microseconds), #' or 'ns' (nanoseconds). #' @param timezone A string representing a timezone name. The empty string "" #' represents a naive point in time (i.e., one that has no associated #' timezone). #' @param column_types A `list()` of [nanoarrow_schema][as_nanoarrow_schema]s. #' @param item_type For [na_list()], [na_large_list()], [na_fixed_size_list()], #' and [na_map()], the [nanoarrow_schema][as_nanoarrow_schema] representing #' the item type. #' @param key_type The [nanoarrow_schema][as_nanoarrow_schema] representing the #' [na_map()] key type. #' @param index_type The [nanoarrow_schema][as_nanoarrow_schema] representing the #' [na_dictionary()] index type. #' @param value_type The [nanoarrow_schema][as_nanoarrow_schema] representing the #' [na_dictionary()] or [na_map()] value type. #' @param keys_sorted Use `TRUE` to assert that keys are sorted. #' @param storage_type For [na_extension()], the underlying value type. #' @param extension_name For [na_extension()], the extension name. 
This is #' typically namespaced separated by dots (e.g., arrow.r.vctrs). #' @param extension_metadata A string or raw vector defining extension metadata. #' Most Arrow extension types define extension metadata as a JSON object. #' @param nullable Use `FALSE` to assert that this field cannot contain #' null values. #' @param ordered Use `TRUE` to assert that the order of values in the #' dictionary are meaningful. #' #' @return A [nanoarrow_schema][as_nanoarrow_schema] #' @export #' #' @examples #' na_int32() #' na_struct(list(col1 = na_int32())) #' na_type <- function(type_name, byte_width = NULL, unit = NULL, timezone = NULL, column_types = NULL, item_type = NULL, key_type = NULL, value_type = NULL, index_type = NULL, ordered = NULL, list_size = NULL, keys_sorted = NULL, storage_type = NULL, extension_name = NULL, extension_metadata = NULL, nullable = NULL) { # Create a call and evaluate it. This leads to reasonable error messages # regarding nonexistent type names and extraneous or missing parameters. 
args <- list( byte_width = byte_width, unit = unit, timezone = timezone, column_types = column_types, item_type = item_type, key_type = key_type, value_type = value_type, index_type = index_type, ordered = ordered, list_size = list_size, keys_sorted = keys_sorted, storage_type = storage_type, extension_name = extension_name, extension_metadata = extension_metadata, nullable = nullable ) args <- args[!vapply(args, is.null, logical(1))] constructor <- as.symbol(paste0("na_", type_name)) call_obj <- as.call(c(list(constructor), args)) eval(call_obj) } #' @rdname na_type #' @export na_na <- function(nullable = TRUE) { .Call(nanoarrow_c_schema_init, NANOARROW_TYPE[["NA"]], isTRUE(nullable)) } #' @rdname na_type #' @export na_bool <- function(nullable = TRUE) { .Call(nanoarrow_c_schema_init, NANOARROW_TYPE$BOOL, isTRUE(nullable)) } #' @rdname na_type #' @export na_int8 <- function(nullable = TRUE) { .Call(nanoarrow_c_schema_init, NANOARROW_TYPE$INT8, isTRUE(nullable)) } #' @rdname na_type #' @export na_uint8 <- function(nullable = TRUE) { .Call(nanoarrow_c_schema_init, NANOARROW_TYPE$UINT8, isTRUE(nullable)) } #' @rdname na_type #' @export na_int16 <- function(nullable = TRUE) { .Call(nanoarrow_c_schema_init, NANOARROW_TYPE$INT16, isTRUE(nullable)) } #' @rdname na_type #' @export na_uint16 <- function(nullable = TRUE) { .Call(nanoarrow_c_schema_init, NANOARROW_TYPE$UINT16, isTRUE(nullable)) } #' @rdname na_type #' @export na_int32 <- function(nullable = TRUE) { .Call(nanoarrow_c_schema_init, NANOARROW_TYPE$INT32, isTRUE(nullable)) } #' @rdname na_type #' @export na_uint32 <- function(nullable = TRUE) { .Call(nanoarrow_c_schema_init, NANOARROW_TYPE$UINT32, isTRUE(nullable)) } #' @rdname na_type #' @export na_int64 <- function(nullable = TRUE) { .Call(nanoarrow_c_schema_init, NANOARROW_TYPE$INT64, isTRUE(nullable)) } #' @rdname na_type #' @export na_uint64 <- function(nullable = TRUE) { .Call(nanoarrow_c_schema_init, NANOARROW_TYPE$UINT64, isTRUE(nullable)) } #' @rdname 
na_type #' @export na_half_float <- function(nullable = TRUE) { .Call(nanoarrow_c_schema_init, NANOARROW_TYPE$HALF_FLOAT, isTRUE(nullable)) } #' @rdname na_type #' @export na_float <- function(nullable = TRUE) { .Call(nanoarrow_c_schema_init, NANOARROW_TYPE$FLOAT, isTRUE(nullable)) } #' @rdname na_type #' @export na_double <- function(nullable = TRUE) { .Call(nanoarrow_c_schema_init, NANOARROW_TYPE$DOUBLE, isTRUE(nullable)) } #' @rdname na_type #' @export na_string <- function(nullable = TRUE) { .Call(nanoarrow_c_schema_init, NANOARROW_TYPE$STRING, isTRUE(nullable)) } #' @rdname na_type #' @export na_large_string <- function(nullable = TRUE) { .Call(nanoarrow_c_schema_init, NANOARROW_TYPE$LARGE_STRING, isTRUE(nullable)) } #' @rdname na_type #' @export na_binary <- function(nullable = TRUE) { .Call(nanoarrow_c_schema_init, NANOARROW_TYPE$BINARY, isTRUE(nullable)) } #' @rdname na_type #' @export na_large_binary <- function(nullable = TRUE) { .Call(nanoarrow_c_schema_init, NANOARROW_TYPE$LARGE_BINARY, isTRUE(nullable)) } #' @rdname na_type #' @export na_fixed_size_binary <- function(byte_width, nullable = TRUE) { .Call( nanoarrow_c_schema_init_fixed_size, NANOARROW_TYPE$FIXED_SIZE_BINARY, as.integer(byte_width)[1], isTRUE(nullable) ) } #' @rdname na_type #' @export na_date32 <- function(nullable = TRUE) { .Call(nanoarrow_c_schema_init, NANOARROW_TYPE$DATE32, isTRUE(nullable)) } #' @rdname na_type #' @export na_date64 <- function(nullable = TRUE) { .Call(nanoarrow_c_schema_init, NANOARROW_TYPE$DATE64, isTRUE(nullable)) } #' @rdname na_type #' @export na_time32 <- function(unit = c("ms", "s"), nullable = TRUE) { unit <- match.arg(unit) .Call( nanoarrow_c_schema_init_date_time, NANOARROW_TYPE$TIME32, time_unit_id(unit), NULL, isTRUE(nullable) ) } #' @rdname na_type #' @export na_time64 <- function(unit = c("us", "ns"), nullable = TRUE) { unit <- match.arg(unit) .Call( nanoarrow_c_schema_init_date_time, NANOARROW_TYPE$TIME64, time_unit_id(unit), NULL, isTRUE(nullable) ) } 
#' @rdname na_type
#' @export
na_duration <- function(unit = c("ms", "s", "us", "ns"), nullable = TRUE) {
  unit <- match.arg(unit)
  .Call(
    nanoarrow_c_schema_init_date_time,
    NANOARROW_TYPE$DURATION,
    time_unit_id(unit),
    NULL,
    isTRUE(nullable)
  )
}

#' @rdname na_type
#' @export
na_interval_months <- function(nullable = TRUE) {
  .Call(nanoarrow_c_schema_init, NANOARROW_TYPE$INTERVAL_MONTHS, isTRUE(nullable))
}

#' @rdname na_type
#' @export
na_interval_day_time <- function(nullable = TRUE) {
  .Call(nanoarrow_c_schema_init, NANOARROW_TYPE$INTERVAL_DAY_TIME, isTRUE(nullable))
}

#' @rdname na_type
#' @export
na_interval_month_day_nano <- function(nullable = TRUE) {
  .Call(nanoarrow_c_schema_init, NANOARROW_TYPE$INTERVAL_MONTH_DAY_NANO, isTRUE(nullable))
}

#' @rdname na_type
#' @export
na_timestamp <- function(unit = c("us", "ns", "s", "ms"), timezone = "", nullable = TRUE) {
  unit <- match.arg(unit)
  if (!is.character(timezone) || length(timezone) != 1 || is.na(timezone)) {
    stop("`timezone` must be character(1)")
  }

  .Call(
    nanoarrow_c_schema_init_date_time,
    NANOARROW_TYPE$TIMESTAMP,
    time_unit_id(unit),
    timezone,
    isTRUE(nullable)
  )
}

#' @rdname na_type
#' @export
na_decimal128 <- function(precision, scale, nullable = TRUE) {
  .Call(
    nanoarrow_c_schema_init_decimal,
    NANOARROW_TYPE$DECIMAL128,
    as.integer(precision)[1],
    as.integer(scale)[1],
    isTRUE(nullable)
  )
}

#' @rdname na_type
#' @export
na_decimal256 <- function(precision, scale, nullable = TRUE) {
  .Call(
    nanoarrow_c_schema_init_decimal,
    NANOARROW_TYPE$DECIMAL256,
    as.integer(precision)[1],
    as.integer(scale)[1],
    isTRUE(nullable)
  )
}

#' @rdname na_type
#' @export
na_struct <- function(column_types = list(), nullable = FALSE) {
  schema <- .Call(nanoarrow_c_schema_init, NANOARROW_TYPE$STRUCT, isTRUE(nullable))
  schema$children <- column_types
  schema
}

#' @rdname na_type
#' @export
na_sparse_union <- function(column_types = list()) {
  schema <- na_struct(column_types)
  # Format string lists the zero-based child ids after the "+us:" prefix
  schema$format <- paste0(
    "+us:",
    paste(seq_along(schema$children) - 1L, collapse = ",")
  )
  schema
}

#' @rdname na_type
#' @export
na_dense_union <- function(column_types = list()) {
  schema <- na_struct(column_types)
  # Format string lists the zero-based child ids after the "+ud:" prefix
  schema$format <- paste0(
    "+ud:",
    paste(seq_along(schema$children) - 1L, collapse = ",")
  )
  schema
}

#' @rdname na_type
#' @export
na_list <- function(item_type, nullable = TRUE) {
  schema <- .Call(nanoarrow_c_schema_init, NANOARROW_TYPE$LIST, isTRUE(nullable))
  schema$children[[1]] <- item_type
  schema
}

#' @rdname na_type
#' @export
na_large_list <- function(item_type, nullable = TRUE) {
  schema <- .Call(nanoarrow_c_schema_init, NANOARROW_TYPE$LARGE_LIST, isTRUE(nullable))
  schema$children[[1]] <- item_type
  schema
}

#' @rdname na_type
#' @export
na_fixed_size_list <- function(item_type, list_size, nullable = TRUE) {
  schema <- .Call(
    nanoarrow_c_schema_init_fixed_size,
    NANOARROW_TYPE$FIXED_SIZE_LIST,
    as.integer(list_size)[1],
    isTRUE(nullable)
  )
  schema$children[[1]] <- item_type
  schema
}

#' @rdname na_type
#' @export
na_map <- function(key_type, item_type, keys_sorted = FALSE, nullable = TRUE) {
  schema <- .Call(nanoarrow_c_schema_init, NANOARROW_TYPE$MAP, isTRUE(nullable))

  # The map's single child holds the key in its first child and the item in
  # its second child.
  schema$children[[1]]$children[[1]] <- key_type
  schema$children[[1]]$children[[2]] <- item_type

  # `keys_sorted` was previously accepted but silently ignored. Record the
  # assertion in the schema flags, as na_dictionary() does for `ordered`.
  if (isTRUE(keys_sorted)) {
    schema$flags <- bitwOr(schema$flags, ARROW_FLAG$MAP_KEYS_SORTED)
  }

  schema
}

#' @rdname na_type
#' @export
na_dictionary <- function(value_type, index_type = na_int32(), ordered = FALSE) {
  index_type <- as_nanoarrow_schema(index_type)
  index_type$dictionary <- value_type

  if (ordered) {
    index_type$flags <- bitwOr(index_type$flags, ARROW_FLAG$DICTIONARY_ORDERED)
  } else {
    index_type$flags <- bitwAnd(
      index_type$flags,
      bitwNot(ARROW_FLAG$DICTIONARY_ORDERED)
    )
  }

  index_type
}

#' @rdname na_type
#' @export
na_extension <- function(storage_type, extension_name, extension_metadata = "") {
  storage_type <- as_nanoarrow_schema(storage_type)

  new_metadata <- list(
    "ARROW:extension:name" = extension_name,
    "ARROW:extension:metadata" = extension_metadata
  )

  # New extension keys come first so that they win over any existing
  # metadata entries with the same name.
  new_metadata <- c(new_metadata, storage_type$metadata)
  storage_type$metadata <- new_metadata[unique(names(new_metadata))]

  storage_type
}

time_unit_id <- function(time_unit) {
  match(time_unit, c("s", "ms", "us", "ns")) - 1L
}

# These values aren't guaranteed to stay stable between nanoarrow versions,
# so we keep them internal but use them in these functions to simplify the
# number of C functions we need to build all the types.
NANOARROW_TYPE <- list(
  UNINITIALIZED = 0,
  "NA" = 1L,
  BOOL = 2L,
  UINT8 = 3L,
  INT8 = 4L,
  UINT16 = 5L,
  INT16 = 6L,
  UINT32 = 7L,
  INT32 = 8L,
  UINT64 = 9L,
  INT64 = 10L,
  HALF_FLOAT = 11L,
  FLOAT = 12L,
  DOUBLE = 13L,
  STRING = 14L,
  BINARY = 15L,
  FIXED_SIZE_BINARY = 16L,
  DATE32 = 17L,
  DATE64 = 18L,
  TIMESTAMP = 19L,
  TIME32 = 20L,
  TIME64 = 21L,
  INTERVAL_MONTHS = 22L,
  INTERVAL_DAY_TIME = 23L,
  DECIMAL128 = 24L,
  DECIMAL256 = 25L,
  LIST = 26L,
  STRUCT = 27L,
  SPARSE_UNION = 28L,
  DENSE_UNION = 29L,
  DICTIONARY = 30L,
  MAP = 31L,
  EXTENSION = 32L,
  FIXED_SIZE_LIST = 33L,
  DURATION = 34L,
  LARGE_STRING = 35L,
  LARGE_BINARY = 36L,
  LARGE_LIST = 37L,
  INTERVAL_MONTH_DAY_NANO = 38L
)

ARROW_FLAG <- list(
  DICTIONARY_ORDERED = 1L,
  NULLABLE = 2L,
  MAP_KEYS_SORTED = 4L
)
nanoarrow/R/pointers.R0000644000176200001440000001523314502402506014451 0ustar liggesusers# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#' Danger zone: low-level pointer operations #' #' The [nanoarrow_schema][as_nanoarrow_schema], #' [nanoarrow_array][as_nanoarrow_array], #' and [nanoarrow_array_stream][as_nanoarrow_array_stream] classes are #' represented in R as external pointers (`EXTPTRSXP`). When these objects #' go out of scope (i.e., when they are garbage collected or shortly #' thereafter), the underlying object's `release()` callback is called if #' the underlying pointer is non-null and if the `release()` callback is #' non-null. #' #' When interacting with other C Data Interface implementations, it is #' important to keep in mind that the R object wrapping these pointers is #' always passed by reference (because it is an external pointer) and may #' be referred to by another R object (e.g., an element in a `list()` or as a #' variable assigned in a user's environment). When importing a schema, #' array, or array stream into nanoarrow this is not a problem: the R object #' takes ownership of the lifecycle and memory is released when the R #' object is garbage collected. In this case, one can use #' [nanoarrow_pointer_move()] where `ptr_dst` was created using #' `nanoarrow_allocate_*()`. #' #' The case of exporting is more complicated and as such has a dedicated #' function, [nanoarrow_pointer_export()], that implements different logic #' for schemas, arrays, and array streams: #' #' - Schema objects are (deep) copied such that a fresh copy of the schema #' is exported and made the responsibility of some other C data interface #' implementation. #' - Array objects are exported as a shell around the original array that #' preserves a reference to the R object. This ensures that the buffers #' and children pointed to by the array are not copied and that any references #' to the original array are not invalidated. 
#' - Array stream objects are moved: the responsibility for the object is #' transferred to the other C data interface implementation and any #' references to the original R object are invalidated. Because these #' objects are mutable, this is typically what you want (i.e., you should #' not be pulling arrays from a stream accidentally from two places). #' #' If you know the lifecycle of your object (i.e., you created the R object #' yourself and never passed references to it elsewhere), you can slightly #' more efficiently call [nanoarrow_pointer_move()] for all three pointer #' types. #' #' @param ptr,ptr_src,ptr_dst An external pointer to a `struct ArrowSchema`, #' `struct ArrowArray`, or `struct ArrowArrayStream`. #' @param protected An object whose scope must outlive that of `ptr`. This is #' useful for array streams since at least two specifications involving the #' array stream specify that the stream is only valid for the lifecycle of #' another object (e.g., an AdbcStatement or OGRDataset). #' @return #' - `nanoarrow_pointer_is_valid()` returns TRUE if the pointer is non-null #' and has a non-null release callback. #' - `nanoarrow_pointer_addr_dbl()` and `nanoarrow_pointer_addr_chr()` return #' pointer representations that may be helpful to facilitate moving or #' exporting nanoarrow objects to other libraries. #' - `nanoarrow_pointer_addr_pretty()` gives a pointer representation suitable #' for printing or error messages. #' - `nanoarrow_pointer_release()` returns `ptr`, invisibly. #' - `nanoarrow_pointer_move()` and `nanoarrow_pointer_export()` return #' `ptr_dst`, invisibly. #' - `nanoarrow_allocate_array()`, `nanoarrow_allocate_schema()`, and #' `nanoarrow_allocate_array_stream()` return an #' [array][as_nanoarrow_array], a [schema][as_nanoarrow_schema], and an #' [array stream][as_nanoarrow_array_stream], respectively. 
#' @export #' nanoarrow_pointer_is_valid <- function(ptr) { .Call(nanoarrow_c_pointer_is_valid, ptr) } #' @rdname nanoarrow_pointer_is_valid #' @export nanoarrow_pointer_addr_dbl <- function(ptr) { .Call(nanoarrow_c_pointer_addr_dbl, ptr) } #' @rdname nanoarrow_pointer_is_valid #' @export nanoarrow_pointer_addr_chr <- function(ptr) { .Call(nanoarrow_c_pointer_addr_chr, ptr) } #' @rdname nanoarrow_pointer_is_valid #' @export nanoarrow_pointer_addr_pretty <- function(ptr) { .Call(nanoarrow_c_pointer_addr_pretty, ptr) } #' @rdname nanoarrow_pointer_is_valid #' @export nanoarrow_pointer_release <- function(ptr) { invisible(.Call(nanoarrow_c_pointer_release, ptr)) } #' @rdname nanoarrow_pointer_is_valid #' @export nanoarrow_pointer_move <- function(ptr_src, ptr_dst) { invisible(.Call(nanoarrow_c_pointer_move, ptr_src, ptr_dst)) } #' @rdname nanoarrow_pointer_is_valid #' @export nanoarrow_pointer_export <- function(ptr_src, ptr_dst) { if (inherits(ptr_src, "nanoarrow_schema")) { .Call(nanoarrow_c_export_schema, ptr_src, ptr_dst) } else if (inherits(ptr_src, "nanoarrow_array")) { .Call(nanoarrow_c_export_array, ptr_src, ptr_dst) } else if (inherits(ptr_src, "nanoarrow_array_stream")) { .Call(nanoarrow_c_export_array_stream, ptr_src, ptr_dst) } else { stop( "`ptr_src` must inherit from 'nanoarrow_schema', 'nanoarrow_array', or 'nanoarrow_array_stream'" ) } invisible(ptr_dst) } #' @rdname nanoarrow_pointer_is_valid #' @export nanoarrow_allocate_schema <- function() { .Call(nanoarrow_c_allocate_schema) } #' @rdname nanoarrow_pointer_is_valid #' @export nanoarrow_allocate_array <- function() { .Call(nanoarrow_c_allocate_array) } #' @rdname nanoarrow_pointer_is_valid #' @export nanoarrow_allocate_array_stream <- function() { .Call(nanoarrow_c_allocate_array_stream) } #' @rdname nanoarrow_pointer_is_valid #' @export nanoarrow_pointer_set_protected <- function(ptr_src, protected) { if (!inherits(ptr_src, c("nanoarrow_schema", "nanoarrow_array", "nanoarrow_array_stream"))) { 
stop( "`ptr_src` must inherit from 'nanoarrow_schema', 'nanoarrow_array', or 'nanoarrow_array_stream'" ) } .Call(nanoarrow_c_pointer_set_protected, ptr_src, protected) invisible(ptr_src) } nanoarrow/R/as-array.R0000644000176200001440000003001114547061553014330 0ustar liggesusers# Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. #' @export as_nanoarrow_array.default <- function(x, ..., schema = NULL, .from_c = FALSE) { # If we're coming from C it's because we've tried all the internal conversions # and no suitable S3 method was found or the x--schema combination is not # implemented in nanoarrow. Try arrow::as_arrow_array(). 
if (.from_c) { # Give extension types a chance to handle conversion parsed <- .Call(nanoarrow_c_schema_parse, schema) if (!is.null(parsed$extension_name)) { spec <- resolve_nanoarrow_extension(parsed$extension_name) return(as_nanoarrow_array_extension(spec, x, ..., schema = schema)) } assert_arrow_installed( sprintf( "create %s array from object of type %s", nanoarrow_schema_formatted(schema), paste0(class(x), collapse = "/") ) ) result <- as_nanoarrow_array( arrow::as_arrow_array( x, type = arrow::as_data_type(schema) ) ) # Skip nanoarrow_pointer_export() for these arrays since we know there # are no external references to them class(result) <- c("nanoarrow_array_dont_export", class(result)) return(result) } if (is.null(schema)) { schema <- infer_nanoarrow_schema(x) } else { schema <- as_nanoarrow_schema(schema) } .Call(nanoarrow_c_as_array_default, x, schema) } #' @export as_nanoarrow_array.nanoarrow_array <- function(x, ..., schema = NULL) { if (is.null(schema)) { return(x) } inferred_schema <- infer_nanoarrow_schema(x) if (nanoarrow_schema_identical(schema, inferred_schema)) { return(x) } NextMethod() } #' @export as_nanoarrow_array.integer64 <- function(x, ..., schema = NULL) { if (is.null(schema)) { schema <- infer_nanoarrow_schema(x) } schema <- as_nanoarrow_schema(schema) parsed <- nanoarrow_schema_parse(schema) if (!is.null(parsed$extension_name)) { spec <- resolve_nanoarrow_extension(parsed$extension_name) return(as_nanoarrow_array_extension(spec, x, ..., schema = schema)) } switch( parsed$type, int64 = , uint64 = { if (anyNA(x)) { is_valid_lgl <- is.finite(x) is_valid <- as_nanoarrow_array(is_valid_lgl, schema = na_bool())$buffers[[2]] na_count <- length(x) - sum(is_valid_lgl) } else { is_valid <- NULL na_count <- 0 } array <- nanoarrow_array_init(schema) nanoarrow_array_modify( array, list( length = length(x), null_count = na_count, buffers = list(is_valid, x) ) ) }, as_nanoarrow_array(as.double(x), schema = schema) ) } #' @export 
as_nanoarrow_array.POSIXct <- function(x, ..., schema = NULL) {
  # Converts a POSIXct vector to an array. When no schema is supplied the
  # schema is inferred from `x`; extension schemas are delegated to the
  # registered extension spec.
  if (is.null(schema)) {
    schema <- infer_nanoarrow_schema(x)
  }

  schema <- as_nanoarrow_schema(schema)
  parsed <- nanoarrow_schema_parse(schema)
  if (!is.null(parsed$extension_name)) {
    spec <- resolve_nanoarrow_extension(parsed$extension_name)
    return(as_nanoarrow_array_extension(spec, x, ..., schema = schema))
  }

  switch(
    parsed$type,
    timestamp = ,
    duration = {
      # Scale the numeric representation of `x` to the requested time unit,
      # convert as the storage type, then attach the requested schema.
      multipliers <- c(s = 1.0, ms = 1e3, us = 1e6, ns = 1e9)
      multiplier <- unname(multipliers[parsed$time_unit])
      array <- as_nanoarrow_array(
        as.numeric(x) * multiplier,
        schema = na_type(parsed$storage_type)
      )
      nanoarrow_array_set_schema(array, schema)
      array
    },
    # Any other requested type falls through to the next S3 method
    NextMethod()
  )
}

#' @export
as_nanoarrow_array.difftime <- function(x, ..., schema = NULL) {
  # Converts a difftime vector to a time32, time64 or duration array, scaling
  # from the units of `x` to the time unit of the requested schema.
  if (is.null(schema)) {
    schema <- infer_nanoarrow_schema(x)
  }

  schema <- as_nanoarrow_schema(schema)
  parsed <- nanoarrow_schema_parse(schema)
  if (!is.null(parsed$extension_name)) {
    spec <- resolve_nanoarrow_extension(parsed$extension_name)
    return(as_nanoarrow_array_extension(spec, x, ..., schema = schema))
  }

  src_unit <- attr(x, "units")
  switch(
    parsed$type,
    time32 = ,
    time64 = ,
    duration = {
      # Combined factor: (source units -> seconds) * (seconds -> target unit).
      # A `units` attribute not listed in src_multipliers yields an NA factor.
      multipliers <- c(s = 1.0, ms = 1e3, us = 1e6, ns = 1e9)
      src_multipliers <- c(
        secs = 1.0,
        mins = 60.0,
        hours = 3600.0,
        days = 86400.0,
        weeks = 604800.0
      )

      multiplier <- unname(multipliers[parsed$time_unit]) *
        unname(src_multipliers[src_unit])
      array <- as_nanoarrow_array(
        as.numeric(x) * multiplier,
        schema = na_type(parsed$storage_type)
      )
      nanoarrow_array_set_schema(array, schema)
      array
    },
    NextMethod()
  )
}

#' @export
as_nanoarrow_array.blob <- function(x, ..., schema = NULL) {
  # Infers the schema from the blob (when not supplied) and then converts the
  # unclassed object. Note that `...` is not forwarded here.
  if (is.null(schema)) {
    schema <- infer_nanoarrow_schema(x)
  }

  as_nanoarrow_array(unclass(x), schema = schema)
}

#' @export
as_nanoarrow_array.data.frame <- function(x, ..., schema = NULL) {
  # We need to override this to prevent the list implementation from handling it
  as_nanoarrow_array.default(x, ..., schema = schema)
}

#' @export
as_nanoarrow_array.list <- function(x, ..., schema = NULL) {
  # Builds a "list" array in R from a list whose elements share a child type.
  # Extension schemas, non-"list" types and nested children are passed to the
  # next method.
  if (is.null(schema)) {
    schema <- infer_nanoarrow_schema(x)
  }

  schema <- as_nanoarrow_schema(schema)
  parsed <- nanoarrow_schema_parse(schema)
  if (!is.null(parsed$extension_name) || parsed$type != "list") {
    return(NextMethod())
  }

  # This R implementation can't handle complex nesting
  if (startsWith(schema$children[[1]]$format, "+")) {
    return(NextMethod())
  }

  array <- nanoarrow_array_init(schema)

  # Flatten one level: the concatenated elements become the child array.
  # unlist() gives NULL when there is nothing to concatenate, in which case
  # an empty array of the "na" type is used as the child.
  child <- unlist(x, recursive = FALSE, use.names = FALSE)
  if (is.null(child)) {
    child_array <- as_nanoarrow_array.vctrs_unspecified(logical(), schema = na_na())
  } else {
    child_array <- as_nanoarrow_array(child, schema = schema$children[[1]])
  }

  # Offsets are the running total of element lengths, starting at zero;
  # NULL elements are the null entries of the list array.
  offsets <- c(0L, cumsum(lengths(x)))
  is_na <- vapply(x, is.null, logical(1))
  # Take the second buffer of the converted logical array as the validity
  # buffer of the list array.
  validity <- as_nanoarrow_array(!is_na)$buffers[[2]]

  nanoarrow_array_modify(
    array,
    list(
      length = length(x),
      null_count = sum(is_na),
      buffers = list(
        validity,
        offsets
      ),
      children = list(
        child_array
      )
    )
  )
}

#' @export
as_nanoarrow_array.Date <- function(x, ..., schema = NULL) {
  # Converts a Date vector to a date32 or date64 array; other requested types
  # are passed to the next method.
  if (is.null(schema)) {
    schema <- infer_nanoarrow_schema(x)
  }

  schema <- as_nanoarrow_schema(schema)
  parsed <- nanoarrow_schema_parse(schema)
  if (!is.null(parsed$extension_name)) {
    spec <- resolve_nanoarrow_extension(parsed$extension_name)
    return(as_nanoarrow_array_extension(spec, x, ..., schema = schema))
  }

  switch(
    parsed$type,
    date32 = {
      # date32 stores the integer value of the Date as-is
      storage <- as_nanoarrow_array(
        as.integer(x),
        schema = na_type(parsed$storage_type)
      )
      nanoarrow_array_set_schema(storage, schema)
      storage
    },
    date64 = {
      # date64 scales the numeric value by 86400000 (milliseconds per day)
      storage <- as_nanoarrow_array(
        as.numeric(x) * 86400000,
        schema = na_type(parsed$storage_type)
      )
      nanoarrow_array_set_schema(storage, schema)
      storage
    },
    NextMethod()
  )
}

#' @export
as_nanoarrow_array.POSIXlt <- function(x, ..., schema = NULL) {
  # Converts a POSIXlt by wrapping its components in a data frame (via
  # new_data_frame()) and converting that with the requested schema.
  if (is.null(schema)) {
    schema <- infer_nanoarrow_schema(x)
  }

  schema <- as_nanoarrow_schema(schema)
  parsed <- nanoarrow_schema_parse(schema)
  if (!is.null(parsed$extension_name)) {
    spec <- resolve_nanoarrow_extension(parsed$extension_name)
    return(as_nanoarrow_array_extension(spec, x, ..., schema = schema))
  }

  as_nanoarrow_array(new_data_frame(x, length(x)), schema = schema)
}

#' @export
as_nanoarrow_array.factor <- function(x, ..., schema = NULL) {
  # Converts a factor either to a dictionary-encoded array (when the schema
  # has a dictionary) or, otherwise, via its character representation.
  if (is.null(schema)) {
    schema <- infer_nanoarrow_schema(x)
  }

  schema <- as_nanoarrow_schema(schema)
  parsed <- nanoarrow_schema_parse(schema)
  if (!is.null(parsed$extension_name)) {
    spec <- resolve_nanoarrow_extension(parsed$extension_name)
    return(as_nanoarrow_array_extension(spec, x, ..., schema = schema))
  }

  if (is.null(schema$dictionary)) {
    return(as_nanoarrow_array(as.character(x), schema = schema))
  }

  # Index storage uses the schema without its dictionary; R factor codes are
  # 1-based, so subtract one to obtain 0-based dictionary indices.
  storage <- schema
  storage$dictionary <- NULL

  array <- as_nanoarrow_array(unclass(x) - 1L, schema = storage)
  array$dictionary <- as_nanoarrow_array(levels(x), schema = schema$dictionary)
  array
}

#' @export
as_nanoarrow_array.vctrs_unspecified <- function(x, ..., schema = NULL) {
  # Converts a vctrs_unspecified vector; for the "na" storage type the result
  # is an array whose every element is null.
  if (is.null(schema)) {
    schema <- infer_nanoarrow_schema(x)
  } else {
    schema <- as_nanoarrow_schema(schema)
  }

  # NOTE(review): this repeats the conversion already done in the else branch
  schema <- as_nanoarrow_schema(schema)
  parsed <- nanoarrow_schema_parse(schema)
  if (!is.null(parsed$extension_name)) {
    spec <- resolve_nanoarrow_extension(parsed$extension_name)
    return(as_nanoarrow_array_extension(spec, x, ..., schema = schema))
  }

  switch(
    parsed$storage_type,
    na = {
      array <- nanoarrow_array_init(schema)
      array$length <- length(x)
      array$null_count <- length(x)
      array
    },
    NextMethod()
  )
}

# Called from C to create a union array when requested.
# There are other types of objects that might make sense to
# convert to a union but we basically just need enough
# for testing at this point.
# Build a dense or sparse union array from a data frame.
#
# `x` is a data frame with between 1 and 127 columns, one column per union
# child; each row may have at most one non-NA value, and the column holding it
# selects the child for that row (rows that are entirely NA use the first
# child). `schema` is the union schema to create. Errors if the column count
# is out of range, if a row has more than one non-NA value, or if `schema` is
# not a dense_union/sparse_union.
union_array_from_data_frame <- function(x, schema) {
  if (length(x) == 0 || length(x) > 127) {
    stop(
      sprintf(
        "Can't convert data frame with %d columns to union array",
        length(x)
      )
    )
  }

  # Compute NAs
  x_is_na <- do.call("cbind", lapply(x, is.na))

  # Make sure we only have one non-NA value per row to make sure we don't drop
  # values
  stopifnot(all(rowSums(!x_is_na) <= 1))

  # Zero-based index of the first non-NA column in each row (0 if none)
  child_index <- rep_len(0L, nrow(x))
  seq_x <- seq_along(x)
  for (i in seq_along(child_index)) {
    for (j in seq_x) {
      if (!x_is_na[i, j]) {
        child_index[i] <- j - 1L
        break
      }
    }
  }

  switch(
    nanoarrow_schema_parse(schema)$storage_type,
    "dense_union" = {
      # For each child, which rows belong to it and the running (zero-based)
      # position of each such row within that child
      is_child <- lapply(seq_x - 1L, "==", child_index)
      child_offset_each <- lapply(is_child, function(x) cumsum(x) - 1L)
      # Pick, for every row, the offset within the child that row belongs to
      child_offset <- vapply(
        seq_along(child_index),
        function(i) child_offset_each[[child_index[i] + 1L]][i],
        integer(1)
      )

      # Each child only contains the rows assigned to it
      children <- Map("[", x, is_child, drop = FALSE)
      names(children) <- names(schema$children)

      array <- nanoarrow_array_init(schema)
      nanoarrow_array_modify(
        array,
        list(
          length = length(child_index),
          null_count = 0,
          buffers = list(as.raw(child_index), child_offset),
          children = children
        )
      )
    },
    "sparse_union" = {
      # Convert as a struct whose children are the union's children, then
      # replace the first buffer with the type ids and relabel the array as a
      # union. The struct schema must be passed as `schema` so that the
      # children are converted to the types declared by the union.
      struct_schema <- na_struct(schema$children)
      array <- as_nanoarrow_array(x, schema = struct_schema)
      array <- nanoarrow_array_modify(
        array,
        list(buffers = list(as.raw(child_index))),
        validate = FALSE
      )
      nanoarrow_array_set_schema(array, schema, validate = TRUE)
      array
    },
    stop("Attempt to create union from non-union array type")
  )
}

# This is defined because it's verbose to pass named arguments from C.
# When converting data frame columns, we try the internal C conversions
# first to save R evaluation overhead. When the internal conversions fail,
# we call as_nanoarrow_array() to dispatch to conversions defined via S3
# dispatch, making sure to let the default method know that we've already
# tried the internal C conversions.
as_nanoarrow_array_from_c <- function(x, schema) {
  # Dispatch through S3, flagging the call with `.from_c = TRUE`
  converted <- as_nanoarrow_array(x, schema = schema, .from_c = TRUE)

  # Results of S3 methods are always validated against the requested schema
  # (even those from the arrow package, which occasionally does not honour
  # the schema argument)
  nanoarrow_array_set_schema(converted, schema, validate = TRUE)
  converted
}
nanoarrow/R/extension-vctrs.R0000644000176200001440000000733014502402562015762 0ustar liggesusers
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

#' Vctrs extension type
#'
#' The Arrow format provides a rich type system that can handle most R
#' vector types; however, many R vector types do not roundtrip perfectly
#' through Arrow memory. The vctrs extension type uses [vctrs::vec_data()],
#' [vctrs::vec_restore()], and [vctrs::vec_ptype()] in calls to
#' [as_nanoarrow_array()] and [convert_array()] to ensure roundtrip fidelity.
#'
#' @param ptype A vctrs prototype as returned by [vctrs::vec_ptype()].
#'   The prototype can be of arbitrary size, but a zero-size vector
#'   is sufficient here.
#' @inheritParams na_type
#'
#' @return A [nanoarrow_schema][as_nanoarrow_schema].
#' @export
#'
#' @examples
#' vctr <- as.POSIXlt("2000-01-02 03:45", tz = "UTC")
#' array <- as_nanoarrow_array(vctr, schema = na_vctrs(vctr))
#' infer_nanoarrow_ptype(array)
#' convert_array(array)
#'
na_vctrs <- function(ptype, storage_type = NULL) {
  # Reduce the input to its prototype; by default the storage type is
  # inferred from the prototype's underlying data.
  ptype <- vctrs::vec_ptype(ptype)

  if (is.null(storage_type)) {
    storage_type <- infer_nanoarrow_schema(vctrs::vec_data(ptype))
  }

  # Note: a potential replacement for this is the JSON generated by the cereal
  # package; however, as of this writing that JSON doesn't handle arbitrary nesting.
  # The arrow package currently uses the non-ASCII version; however, it generally
  # makes life easier if the metadata is valid UTF-8. The deserializer works with
  # either.
  na_extension(storage_type, "arrow.r.vctrs", serialize(ptype, NULL, ascii = TRUE))
}

# Registers the "arrow.r.vctrs" extension name with a spec of subclass
# "nanoarrow_extension_spec_vctrs", which the S3 methods below dispatch on.
register_vctrs_extension <- function() {
  register_nanoarrow_extension(
    "arrow.r.vctrs",
    nanoarrow_extension_spec(subclass = "nanoarrow_extension_spec_vctrs")
  )
}

# Recovers the prototype by unserializing the extension metadata of schema `x`.
#' @export
infer_nanoarrow_ptype_extension.nanoarrow_extension_spec_vctrs <- function(extension_spec, x, ...) {
  parsed <- .Call(nanoarrow_c_schema_parse, x)
  # NOTE(review): unserialize() is applied to bytes read from the schema's
  # metadata. If schemas can arrive from untrusted sources this deserializes
  # externally-controlled R objects -- confirm the trust model.
  unserialize(parsed$extension_metadata)
}

# Converts the storage of `array` to the underlying data of the serialized
# prototype, restores the vctrs class, and casts to `to` when it is not NULL.
#' @export
convert_array_extension.nanoarrow_extension_spec_vctrs <- function(extension_spec, array, to, ...) {
  # Restore the vector data to the ptype that is serialized in the type metadata
  to_r_data <- infer_nanoarrow_ptype(array)
  to_data <- vctrs::vec_data(to_r_data)
  data <- convert_array_extension(NULL, array, to_data, warn_unregistered = FALSE)
  vctr <- vctrs::vec_restore(data, to_r_data)

  # Cast to `to` if a different ptype was requested
  if (!is.null(to)) {
    vctrs::vec_cast(vctr, to)
  } else {
    vctr
  }
}

# Converts the underlying data of `x` using `schema` stripped of its extension
# metadata, then wraps the result as an "arrow.r.vctrs" extension array that
# carries the original extension metadata.
#' @export
as_nanoarrow_array_extension.nanoarrow_extension_spec_vctrs <- function(
    extension_spec, x, ...,
    schema = NULL) {
  storage_schema <- schema
  storage_schema$metadata[["ARROW:extension:name"]] <- NULL
  storage_schema$metadata[["ARROW:extension:metadata"]] <- NULL

  storage_array <- as_nanoarrow_array(
    vctrs::vec_data(x),
    schema = storage_schema
  )

  nanoarrow_extension_array(
    storage_array,
    "arrow.r.vctrs",
    schema$metadata[["ARROW:extension:metadata"]]
  )
}
nanoarrow/R/schema.R0000644000176200001440000002352414547061553014064 0ustar liggesusers
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

#' Convert an object to a nanoarrow schema
#'
#' In nanoarrow a 'schema' refers to a `struct ArrowSchema` as defined in the
#' Arrow C Data interface. This data structure can be used to represent an
#' [arrow::schema()], an [arrow::field()], or an `arrow::DataType`.
#' Note that
#' in nanoarrow, an [arrow::schema()] and a non-nullable [arrow::struct()]
#' are represented identically.
#'
#' @param x An object to convert to a schema
#' @param recursive Use `TRUE` to include a `children` member when parsing
#'   schemas.
#' @param new_values New schema component to assign
#' @param validate Use `FALSE` to skip schema validation
#' @param ... Passed to S3 methods
#'
#' @return An object of class 'nanoarrow_schema'
#' @export
#'
#' @examples
#' infer_nanoarrow_schema(integer())
#' infer_nanoarrow_schema(data.frame(x = integer()))
#'
as_nanoarrow_schema <- function(x, ...) {
  UseMethod("as_nanoarrow_schema")
}

#' @export
as_nanoarrow_schema.nanoarrow_schema <- function(x, ...) {
  x
}

#' @rdname as_nanoarrow_schema
#' @export
infer_nanoarrow_schema <- function(x, ...) {
  UseMethod("infer_nanoarrow_schema")
}

#' @export
infer_nanoarrow_schema.default <- function(x, ...) {
  # No method matched: report the full class vector in the error
  cls <- paste(class(x), collapse = "/")
  stop(sprintf("Can't infer Arrow type for object of class %s", cls))
}

#' @export
infer_nanoarrow_schema.raw <- function(x, ...) {
  na_uint8()
}

#' @export
infer_nanoarrow_schema.logical <- function(x, ...) {
  na_bool()
}

#' @export
infer_nanoarrow_schema.integer <- function(x, ...) {
  na_int32()
}

#' @export
infer_nanoarrow_schema.double <- function(x, ...) {
  na_double()
}

#' @export
infer_nanoarrow_schema.character <- function(x, ...) {
  # Use the large string type when the total byte count exceeds
  # .Machine$integer.max
  if (length(x) > 0 && sum(nchar(x, type = "bytes"), na.rm = TRUE) > .Machine$integer.max) {
    na_large_string()
  } else {
    na_string()
  }
}

#' @export
infer_nanoarrow_schema.integer64 <- function(x, ...) {
  na_int64()
}

#' @export
infer_nanoarrow_schema.factor <- function(x, ...) {
  # Dictionary of the levels' type with int32 indices
  na_dictionary(
    infer_nanoarrow_schema(levels(x)),
    na_int32(),
    ordered = is.ordered(x)
  )
}

#' @export
infer_nanoarrow_schema.POSIXct <- function(x, ...) {
  # A missing or empty tzone attribute is replaced by the system time zone
  tz <- attr(x, "tzone")
  if (is.null(tz) || identical(tz, "")) {
    tz <- Sys.timezone()
  }

  na_timestamp(timezone = tz)
}

#' @export
infer_nanoarrow_schema.POSIXlt <- function(x, ...) {
  infer_nanoarrow_schema(new_data_frame(x, length(x)))
}

#' @export
infer_nanoarrow_schema.Date <- function(x, ...) {
  na_date32()
}

#' @export
infer_nanoarrow_schema.difftime <- function(x, ...) {
  # A balance between safety for large time ranges (not overflowing)
  # and safety for small time ranges (not truncating)
  na_duration(unit = "us")
}

#' @export
infer_nanoarrow_schema.data.frame <- function(x, ...) {
  na_struct(lapply(x, infer_nanoarrow_schema), nullable = FALSE)
}

#' @export
infer_nanoarrow_schema.hms <- function(x, ...) {
  # As a default, ms is safer than s and less likely to truncate
  na_time32(unit = "ms")
}

#' @export
infer_nanoarrow_schema.blob <- function(x, ...) {
  # Use the large binary type when the total size exceeds .Machine$integer.max
  if (length(x) > 0 && sum(lengths(x)) > .Machine$integer.max) {
    na_large_binary()
  } else {
    na_binary()
  }
}

#' @export
infer_nanoarrow_schema.vctrs_unspecified <- function(x, ...) {
  na_na()
}

#' @export
infer_nanoarrow_schema.vctrs_list_of <- function(x, ...) {
  # The child type comes from the list_of's ptype attribute
  child_type <- infer_nanoarrow_schema(attr(x, "ptype"))
  if (length(x) > 0 && sum(lengths(x)) > .Machine$integer.max) {
    na_large_list(child_type)
  } else {
    na_list(child_type)
  }
}

#' @export
infer_nanoarrow_schema.AsIs <- function(x, ...) {
  # NextMethod() goes directly to `default`
  class(x) <- class(x)[-1]
  infer_nanoarrow_schema(x)
}

#' @export
infer_nanoarrow_schema.list <- function(x, ...) {
  # TODO: Move this to C
  # A list of only NULLs is a list of the "na" type; a list of raw vectors
  # (and NULLs) is binary; anything else goes to the next method.
  is_null <- vapply(x, is.null, logical(1))
  if (all(is_null)) {
    return(na_list(na_na()))
  }

  is_raw <- vapply(x, is.raw, logical(1))
  if (!all(is_raw | is_null)) {
    return(NextMethod())
  }

  if (length(x) > 0 && sum(lengths(x)) > .Machine$integer.max) {
    na_large_binary()
  } else {
    na_binary()
  }
}

#' @rdname as_nanoarrow_schema
#' @export
nanoarrow_schema_parse <- function(x, recursive = FALSE) {
  # Convert once and use the converted schema throughout, so that inputs
  # that are not already a nanoarrow_schema are also parsed recursively.
  schema <- as_nanoarrow_schema(x)
  parsed <- .Call(nanoarrow_c_schema_parse, schema)

  # Drop the members that do not apply to this type
  parsed_null <- vapply(parsed, is.null, logical(1))
  result <- parsed[!parsed_null]

  if (recursive && length(schema$children) > 0) {
    result$children <- lapply(schema$children, nanoarrow_schema_parse, TRUE)
  }

  result
}

#' @rdname as_nanoarrow_schema
#' @export
nanoarrow_schema_modify <- function(x, new_values, validate = TRUE) {
  schema <- as_nanoarrow_schema(x)

  if (length(new_values) == 0) {
    return(schema)
  }

  # Make sure new_values has names to iterate over
  new_names <- names(new_values)
  if (is.null(new_names) || all(new_names == "", na.rm = TRUE)) {
    stop("`new_values` must be named")
  }

  # Make a deep copy and modify it. Possibly not as efficient as it could be
  # but it's unclear to what degree performance is an issue for R-level
  # schema modification.
  schema_deep_copy <- nanoarrow_allocate_schema()
  nanoarrow_pointer_export(schema, schema_deep_copy)

  # Apply each named component to the copy; unknown names are an error
  for (i in seq_along(new_values)) {
    nm <- new_names[i]
    value <- new_values[[i]]

    switch(
      nm,
      format = .Call(
        nanoarrow_c_schema_set_format,
        schema_deep_copy,
        as.character(value)
      ),
      name = {
        if (!is.null(value)) {
          value <- as.character(value)
        }

        .Call(nanoarrow_c_schema_set_name, schema_deep_copy, value)
      },
      flags = .Call(
        nanoarrow_c_schema_set_flags,
        schema_deep_copy,
        as.integer(value)
      ),
      metadata = .Call(
        nanoarrow_c_schema_set_metadata,
        schema_deep_copy,
        as.list(value)
      ),
      children = {
        if (!is.null(value)) {
          value <- lapply(value, as_nanoarrow_schema)
        }

        .Call(nanoarrow_c_schema_set_children, schema_deep_copy, value)
      },
      dictionary = {
        if (!is.null(value)) {
          value <- as_nanoarrow_schema(value)
        }

        .Call(nanoarrow_c_schema_set_dictionary, schema_deep_copy, value)
      },
      stop(sprintf("Can't modify schema[[%s]]: does not exist", deparse(nm)))
    )
  }

  # Parsing the modified copy is the validation step
  if (validate) {
    nanoarrow_schema_parse(schema_deep_copy, recursive = FALSE)
  }

  schema_deep_copy
}

# TRUE when the two schemas are the same object or have identical recursive
# list representations.
nanoarrow_schema_identical <- function(x, y) {
  identical(x, y) ||
    identical(
      nanoarrow_schema_proxy(x, recursive = TRUE),
      nanoarrow_schema_proxy(y, recursive = TRUE)
    )
}

#' @importFrom utils str
#' @export
str.nanoarrow_schema <- function(object, ...) {
  cat(sprintf("%s\n", format(object, .recursive = FALSE)))

  if (nanoarrow_pointer_is_valid(object)) {
    # Use the str() of the list version but remove the first
    # line of the output ("List of 6")
    info <- nanoarrow_schema_proxy(object)
    raw_str_output <- utils::capture.output(str(info, ...))
    cat(paste0(raw_str_output[-1], collapse = "\n"))
    cat("\n")
  }

  invisible(object)
}

#' @export
print.nanoarrow_schema <- function(x, ...) {
  str(x, ...)
  invisible(x)
}

#' @export
format.nanoarrow_schema <- function(x, ..., .recursive = TRUE) {
  # The format string must contain a %s for the formatted schema; an empty
  # format string discards it and yields "".
  sprintf(
    "<nanoarrow_schema %s>",
    nanoarrow_schema_formatted(x, .recursive)
  )
}

# This is the list()-like interface to nanoarrow_schema that allows $ and [[
# to make nice auto-complete for the schema fields

#' @export
length.nanoarrow_schema <- function(x, ...) {
  6L
}

#' @export
names.nanoarrow_schema <- function(x, ...) {
  c("format", "name", "metadata", "flags", "children", "dictionary")
}

#' @export
`[[.nanoarrow_schema` <- function(x, i, ...) {
  nanoarrow_schema_proxy(x)[[i]]
}

#' @export
`$.nanoarrow_schema` <- function(x, i, ...) {
  nanoarrow_schema_proxy(x)[[i]]
}

#' @export
`[[<-.nanoarrow_schema` <- function(x, i, value) {
  # Translate a numeric index in 1:6 to the corresponding field name
  if (is.numeric(i) && isTRUE(i %in% 1:6)) {
    i <- names.nanoarrow_schema()[[i]]
  }

  if (is.character(i) && (length(i) == 1L) && !is.na(i)) {
    new_values <- list(value)
    names(new_values) <- i
    return(nanoarrow_schema_modify(x, new_values))
  }

  stop("`i` must be character(1) or integer(1) %in% 1:6")
}

#' @export
`$<-.nanoarrow_schema` <- function(x, i, value) {
  new_values <- list(value)
  names(new_values) <- i
  nanoarrow_schema_modify(x, new_values)
}

# Formats a schema as a string via the C-level formatter
nanoarrow_schema_formatted <- function(x, recursive = TRUE) {
  .Call(nanoarrow_c_schema_format, x, as.logical(recursive)[1])
}

# Returns the list representation of a schema; with `recursive = TRUE` the
# children and dictionary are converted to lists as well.
nanoarrow_schema_proxy <- function(schema, recursive = FALSE) {
  result <- .Call(nanoarrow_c_schema_to_list, schema)
  if (recursive && !is.null(schema$children)) {
    result$children <- lapply(
      schema$children,
      nanoarrow_schema_proxy,
      recursive = TRUE
    )
  }

  if (recursive && !is.null(schema$dictionary)) {
    result$dictionary <- nanoarrow_schema_proxy(schema$dictionary, recursive = TRUE)
  }

  result$metadata <- list_of_raw_to_metadata(result$metadata)

  result
}

# Converts each metadata value to a string when it can be marked as UTF-8;
# values that are already character, contain a zero byte, or cannot be
# converted are returned unchanged.
list_of_raw_to_metadata <- function(metadata) {
  lapply(metadata, function(x) {
    if (is.character(x) || any(x == 0)) {
      x
    } else {
      x_str <- iconv(list(x), from = "UTF-8", to = "UTF-8", mark = TRUE)[[1]]
      if (is.na(x_str)) x else x_str
    }
  })
}
nanoarrow/R/altrep.R0000644000176200001440000000232114377444470014107 0ustar liggesusers
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# For testing the altrep chr conversion
# Thin wrapper around the compiled routine nanoarrow_c_make_altrep_chr.
nanoarrow_altrep_chr <- function(array) {
  .Call(nanoarrow_c_make_altrep_chr, array)
}

# Thin wrapper around nanoarrow_c_is_altrep; presumably reports whether `x`
# is an ALTREP object created by nanoarrow -- confirm against the C source.
is_nanoarrow_altrep <- function(x) {
  .Call(nanoarrow_c_is_altrep, x)
}

# Calls nanoarrow_c_altrep_force_materialize and returns its result
# invisibly. `recursive` is passed straight through to the C routine.
nanoarrow_altrep_force_materialize <- function(x, recursive = FALSE) {
  invisible(.Call(nanoarrow_c_altrep_force_materialize, x, recursive))
}

# Thin wrapper around the compiled routine nanoarrow_c_altrep_is_materialized.
is_nanoarrow_altrep_materialized <- function(x) {
  .Call(nanoarrow_c_altrep_is_materialized, x)
}
nanoarrow/R/pkg-arrow.R0000644000176200001440000002010014547061553014520 0ustar liggesusers
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# exported in zzz.R
infer_type.nanoarrow_array <- function(x, ...) {
  arrow::as_data_type(infer_nanoarrow_schema(x, ...))
}

# Exports the schema into a freshly allocated schema and imports that copy
# as an arrow DataType.
as_data_type.nanoarrow_schema <- function(x, ...) {
  exportable_schema <- nanoarrow_allocate_schema()
  nanoarrow_pointer_export(x, exportable_schema)
  getFromNamespace("DataType", "arrow")$import_from_c(exportable_schema)
}

# Same as above, importing the exported copy as an arrow Schema.
as_schema.nanoarrow_schema <- function(x, ...) {
  exportable_schema <- nanoarrow_allocate_schema()
  nanoarrow_pointer_export(x, exportable_schema)
  arrow::Schema$import_from_c(exportable_schema)
}

# Exports the array and its schema into fresh structures, imports them as an
# arrow Array, and casts to `type` when one is given.
as_arrow_array.nanoarrow_array <- function(x, ..., type = NULL) {
  exportable_schema <- nanoarrow_allocate_schema()
  exportable_array <- nanoarrow_allocate_array()

  schema <- .Call(nanoarrow_c_infer_schema_array, x)
  nanoarrow_pointer_export(schema, exportable_schema)
  nanoarrow_pointer_export(x, exportable_array)

  result <- arrow::Array$import_from_c(exportable_array, exportable_schema)

  if (!is.null(type)) {
    result$cast(arrow::as_data_type(type))
  } else {
    result
  }
}

# Collects the stream as a ChunkedArray; a single chunk is returned directly,
# otherwise the chunks are combined by arrow::as_arrow_array().
as_arrow_array.nanoarrow_array_stream <- function(x, ..., type = NULL) {
  chunked <- as_chunked_array.nanoarrow_array_stream(x, ..., type = type)
  if (chunked$num_chunks == 1) {
    chunked$chunks[[1]]
  } else {
    arrow::as_arrow_array(chunked)
  }
}

as_chunked_array.nanoarrow_array <- function(x, ..., type = NULL) {
  arrow::as_chunked_array(as_arrow_array.nanoarrow_array(x, ..., type = type))
}

# Collects every array from the stream into a ChunkedArray and releases the
# stream on exit.
# NOTE(review): `type` is accepted but not used in this body; the type always
# comes from the stream's schema -- confirm whether a cast is intended.
as_chunked_array.nanoarrow_array_stream <- function(x, ..., type = NULL) {
  on.exit(x$release())

  schema <- infer_nanoarrow_schema(x)
  chunks <- collect_array_stream(x, validate = FALSE)
  arrow::ChunkedArray$create(!!!chunks, type = arrow::as_data_type(schema))
}

# Exports the array and its schema, imports them as an arrow RecordBatch, and
# converts to `schema` when one is given.
as_record_batch.nanoarrow_array <- function(x, ..., schema = NULL) {
  exportable_schema <- nanoarrow_allocate_schema()
  exportable_array <- nanoarrow_allocate_array()

  nanoarrow_pointer_export(
    .Call(nanoarrow_c_infer_schema_array, x),
    exportable_schema
  )
  nanoarrow_pointer_export(x, exportable_array)

  result <- arrow::RecordBatch$import_from_c(exportable_array, exportable_schema)

  if (!is.null(schema)) {
    arrow::as_record_batch(result, schema = schema)
  } else {
    result
  }
}

as_arrow_table.nanoarrow_array <- function(x, ..., schema = NULL) {
  arrow::as_arrow_table(
    as_record_batch.nanoarrow_array(x, schema = schema)
  )
}

# Reads the whole stream into an arrow Table (releasing the stream on exit)
# and casts to `schema` when one is given.
as_arrow_table.nanoarrow_array_stream <- function(x, ..., schema = NULL) {
  on.exit(x$release())

  table <- arrow::as_arrow_table(as_record_batch_reader.nanoarrow_array_stream(x))

  if (!is.null(schema)) {
    table$cast(arrow::as_schema(schema))
  } else {
    table
  }
}

as_record_batch_reader.nanoarrow_array_stream <- function(x, ..., schema = NULL) {
  # TODO: not supporting an explicit schema here yet
  stopifnot(is.null(schema))

  # Export stream to ensure self-containedness
  stream_out <- nanoarrow::nanoarrow_allocate_array_stream()
  nanoarrow_pointer_export(x, stream_out)
  arrow::RecordBatchReader$import_from_c(stream_out)
}

# The three methods below export an arrow type/field/schema object into a
# freshly allocated nanoarrow schema.
#' @export
as_nanoarrow_schema.DataType <- function(x, ...) {
  schema <- nanoarrow_allocate_schema()
  x$export_to_c(schema)
  schema
}

#' @export
as_nanoarrow_schema.Field <- function(x, ...) {
  schema <- nanoarrow_allocate_schema()
  x$export_to_c(schema)
  schema
}

#' @export
as_nanoarrow_schema.Schema <- function(x, ...) {
  schema <- nanoarrow_allocate_schema()
  x$export_to_c(schema)
  schema
}

# Schema inference for arrow objects: array-like objects export their `type`,
# tabular and reader-like objects export their `schema`.
#' @export
infer_nanoarrow_schema.Array <- function(x, ...) {
  as_nanoarrow_schema.DataType(x$type)
}

#' @export
infer_nanoarrow_schema.Scalar <- function(x, ...) {
  as_nanoarrow_schema.DataType(x$type)
}

#' @export
infer_nanoarrow_schema.Expression <- function(x, ...) {
  # For an Expression, `type` is called as a method rather than read as a field
  as_nanoarrow_schema.DataType(x$type())
}

#' @export
infer_nanoarrow_schema.ChunkedArray <- function(x, ...) {
  as_nanoarrow_schema.DataType(x$type)
}

#' @export
infer_nanoarrow_schema.ArrowTabular <- function(x, ...) {
  as_nanoarrow_schema.Schema(x$schema)
}

#' @export
infer_nanoarrow_schema.RecordBatchReader <- function(x, ...) {
  as_nanoarrow_schema.Schema(x$schema)
}

#' @export
infer_nanoarrow_schema.Dataset <- function(x, ...) {
  as_nanoarrow_schema.Schema(x$schema)
}

#' @export
infer_nanoarrow_schema.Scanner <- function(x, ...) {
  as_nanoarrow_schema.Schema(x$schema)
}

#' @export
infer_nanoarrow_schema.arrow_dplyr_query <- function(x, ...) {
  infer_nanoarrow_schema.RecordBatchReader(arrow::as_record_batch_reader(x))
}

# Casts to `schema` when one is given, exports the arrow Array into fresh
# nanoarrow structures, and attaches the exported schema to the array.
#' @export
as_nanoarrow_array.Array <- function(x, ..., schema = NULL) {
  imported_schema <- nanoarrow_allocate_schema()
  array <- nanoarrow_allocate_array()

  if (!is.null(schema)) {
    x <- x$cast(arrow::as_data_type(schema))
  }

  x$export_to_c(array, imported_schema)
  nanoarrow_array_set_schema(array, imported_schema)
  array
}

# Converts the ChunkedArray to a single arrow Array (with the requested type,
# if any) and then exports that array.
#' @export
as_nanoarrow_array.ChunkedArray <- function(x, ..., schema = NULL) {
  if (is.null(schema)) {
    array <- arrow::as_arrow_array(x)
  } else {
    array <- arrow::as_arrow_array(x, type = arrow::as_data_type(schema))
  }

  as_nanoarrow_array.Array(array)
}

# Same export pattern as the Array method, for a RecordBatch.
#' @export
as_nanoarrow_array.RecordBatch <- function(x, ..., schema = NULL) {
  imported_schema <- nanoarrow_allocate_schema()
  array <- nanoarrow_allocate_array()

  if (!is.null(schema)) {
    x <- x$cast(arrow::as_schema(schema))
  }

  x$export_to_c(array, imported_schema)
  nanoarrow_array_set_schema(array, imported_schema)
  array
}

# Converts the Table to a single RecordBatch (with the requested schema, if
# any) and then exports that batch.
#' @export
as_nanoarrow_array.Table <- function(x, ..., schema = NULL) {
  if (is.null(schema)) {
    batch <- arrow::as_record_batch(x)
  } else {
    batch <- arrow::as_record_batch(x, schema = arrow::as_schema(schema))
  }

  as_nanoarrow_array.RecordBatch(batch)
}

#' @export
as_nanoarrow_array_stream.RecordBatchReader <- function(x, ..., schema = NULL) {
  # TODO: not supporting an explicit schema here yet
  stopifnot(is.null(schema))

  array_stream <- nanoarrow_allocate_array_stream()
  x$export_to_c(array_stream)
  array_stream
}

# Casts to `schema` when one is given, then streams via a RecordBatchReader.
#' @export
as_nanoarrow_array_stream.ArrowTabular <- function(x, ..., schema = NULL) {
  if (!is.null(schema)) {
    x <- x$cast(arrow::as_schema(schema))
  }

  as_nanoarrow_array_stream.RecordBatchReader(arrow::as_record_batch_reader(x))
}

# The next three methods convert to a RecordBatchReader and forward `schema`
# to the reader method, which currently requires it to be NULL.
#' @export
as_nanoarrow_array_stream.Dataset <- function(x, ..., schema = NULL) {
  as_nanoarrow_array_stream.RecordBatchReader(
    arrow::as_record_batch_reader(x),
    ...,
    schema = schema
  )
}

#' @export
as_nanoarrow_array_stream.arrow_dplyr_query <- function(x, ..., schema = NULL) {
  as_nanoarrow_array_stream.RecordBatchReader(
    arrow::as_record_batch_reader(x),
    ...,
    schema = schema
  )
}

#' @export
as_nanoarrow_array_stream.Scanner <- function(x, ..., schema = NULL) {
  as_nanoarrow_array_stream.RecordBatchReader(
    arrow::as_record_batch_reader(x),
    ...,
    schema = schema
  )
}

# Casts to `schema` when one is given and streams the chunks one by one.
#' @export
as_nanoarrow_array_stream.ChunkedArray <- function(x, ..., schema = NULL) {
  if (!is.null(schema)) {
    x <- x$cast(arrow::as_data_type(schema))
  }

  schema <- as_nanoarrow_schema.DataType(x$type)

  # Could be more efficient (involves an S3 dispatch + export for each chunk)
  basic_array_stream(x$chunks, schema = schema, validate = FALSE)
}

# Casts to `schema` when one is given and streams the array as a single chunk.
#' @export
as_nanoarrow_array_stream.Array <- function(x, ..., schema = NULL) {
  if (!is.null(schema)) {
    x <- x$cast(arrow::as_data_type(schema))
  }

  schema <- as_nanoarrow_schema.DataType(x$type)
  basic_array_stream(list(x), schema = schema, validate = FALSE)
}
nanoarrow/NEWS.md0000644000176200001440000000557614556775536013422 0ustar liggesusers
# nanoarrow 0.4.0

- Fix source links from pkgdown site (#315).
- Provide LinkingTo headers for extension packages (#332).
- Add more `nanoarrow_array_stream` generics (#349).
- Add conversion from integer type to `character()` (#345).
- Ensure simple `list()`s can be converted without arrow installed (#344).
# nanoarrow 0.3.0.1 - Ensure wrapper array stream eagerly releases the wrapped array stream (#333). # nanoarrow 0.3.0 - Use classed warnings to signal that a lossy conversion occurred (#298) - Add support for `bit64::integer64()` conversions (#293) - Implement extension type registration/conversion (#288) - Implement dictionary conversion (#285) - Ensure `ordered` is reflected in `na_dictionary()` (#299) - Warn for possibly out of range int64 -> double conversions (#294) - Support map conversion to R vector (#282) - Don't link to arrow package R6 class pages (#269) - Use `basic_array_stream()` to improve array stream to data.frame conversion (#279) # nanoarrow 0.2.0-1 - Don't link to arrow package R6 class pages (#269) # nanoarrow 0.2.0 ## New features - Improve printing and conversion of buffers (#208) - Add `enum ArrowType buffer_data_type` member to `struct ArrowLayout` (#207) - Implement ListChildOffset function (#197) - Add ability to deterministically run a finalizer on an array stream (#196) - Union array support (#195) - Add ArrowArrayStream implementation to support keeping a dependent object in scope (#194) - Add `as_nanoarrow_array()` implementation that does not fall back on `arrow::as_arrow_array()` everywhere (#108) - Create nanoarrow_array objects from buffers (#105) - Implement infer schema methods (#104) - Create and modify nanoarrow_schema objects (#101) ## Bugfixes - Fix `convert_array_stream()` for non-record batch stream with zero batches (#212) - clear `release` in `EmptyArrayStream::release_wrapper` (#204) - Release streams when calling `as.vector()` or `as.data.frame()` (#202) - Don't invoke undefined behaviour in conversions to/from Arrow (#167) - Use strict prototypes in all internal C functions (#151) - Don't memcpy NULL when converting buffer to raw (#149) nanoarrow/MD50000644000176200001440000001321614557000642012576 0ustar liggesusers70cd613601aee193df315705844d994c *DESCRIPTION a56108a60be881ae0a23370820776b20 *NAMESPACE 
90a8f6cb4572fcf9db3c7fe6b056b40b *NEWS.md 3f4b9900b02b437eb56d26cbf22466fc *R/altrep.R da959399c29340892a633c8f6a9b4653 *R/array-stream.R 966c63d6fe83a3ab18e488de95d27287 *R/array.R 562dcb1a7487529242f601ab03ff02a4 *R/as-array.R 3bf6853b597287bd64dda0df6b9e3d72 *R/buffer.R 5dfde2e8099afe812764b20250530f9c *R/convert-array-stream.R 413147d8946080ebd5970945fcf86679 *R/convert-array.R 5aa088fc926e17643f51275d90f10a5f *R/extension-vctrs.R 0e798a3afa1f5b0e4c9555ba1b808a99 *R/extension.R 45d92e553cf36b96af2ead476bd121b4 *R/infer-ptype.R db26ada34d5262762a300cc2a3e26e10 *R/nanoarrow-package.R 22e5846999af4b2062552482a1387371 *R/pkg-arrow.R 132650b0d3b75350945c6b040b5ce483 *R/pointers.R 3e052f60c0fd58608d65528d76c3ca13 *R/schema.R e479ed3f31b77a7d47de421fdbea6c8d *R/type.R 06270acd344a99cd3d1d06db5514dbfd *R/util.R 5752b83fec0a385e9c8298c688fa7552 *R/zzz.R ba1490eb4626847a71feac2e1ab6aea3 *README.md 8b2fa4425ae1a44f63f0b2e6a334bbf3 *configure d8b3e9c52c24ff34bd0f06cbe475812d *configure.win 99f618541e190abd57cbdf66cf9c85ea *inst/include/nanoarrow/r.h 7aaadb46d2e09fe111c89629b92e83af *man/array_stream_set_finalizer.Rd 4d587015438f353cb5b72675ae2ca659 *man/as_nanoarrow_array.Rd bd2f9b6181d29ed93358fb1f2d52e9dc *man/as_nanoarrow_array_stream.Rd d6a23e2f24251d5e404cf2d69b11e4b3 *man/as_nanoarrow_buffer.Rd 64371e8326850178c621394b6941f881 *man/as_nanoarrow_schema.Rd ab560cd52af1016a5aaccb5623533ec7 *man/basic_array_stream.Rd 1d60d9659218dac49c5f034af67910bb *man/convert_array.Rd 6e131087aca590558b26f2a79f30a4d2 *man/convert_array_stream.Rd 46525b8260de44f421c2768dff2f841c *man/infer_nanoarrow_ptype.Rd f07a675b459cdcf282e645af2ac6e524 *man/infer_nanoarrow_ptype_extension.Rd 87e9e8a851cfc7c85ae2923b2d7db8d1 *man/na_type.Rd 75e01324b45205813f9fadab4ca4df8f *man/na_vctrs.Rd 0e29c0c30f684c898cd9a069e3307ac9 *man/nanoarrow-package.Rd ea2719b6457c3dc1b270e3ac1d8c445a *man/nanoarrow_array_init.Rd ead3237b06c2e034738de27f68625336 *man/nanoarrow_buffer_init.Rd 
cad85e570d72538909919fba3aa2de80 *man/nanoarrow_extension_array.Rd c653c8e32b8780d91701e42109565657 *man/nanoarrow_extension_spec.Rd 8c085033c3bfb38aa7e7bb5a2e2eec35 *man/nanoarrow_pointer_is_valid.Rd ad4e8783c1b88a84de24615fef1dd178 *man/nanoarrow_version.Rd 3bb6f346a17a5dfb4bf6e3f43dd3b151 *src/Makevars f1d9b2458107adf780e435b49aa2c34f *src/altrep.c 2d48b1bb763debda602979c4ef478594 *src/altrep.h 5e443ad5cd9e1b0604be1d4fc8dad84f *src/array.c 38269433b1f19a6ee9c93fba69aaab37 *src/array.h 8aa5cde175f2a226f4102ef723226055 *src/array_stream.c eb4b0c65b3e4210a8e336e4195366748 *src/array_stream.h 9a01a75c2877fcbea0e2a9a68d3ebdeb *src/array_view.c 10f66cefa23bfd607ec8c2f8f18ebafc *src/array_view.h 01602cba00f445f9675ace29d2d22954 *src/as_array.c 09beeb118bda494b49a7f5be09b571a4 *src/buffer.c 68dec2a47008d3ad8adaf3d30ca6b291 *src/buffer.h 4e433477f4412a6b2ba9a6a82799bf39 *src/convert.c 16f1f979c2e5d79fdfc20ccd07100cc8 *src/convert.h 376f88f291761406c65339af8874762b *src/convert_array.c 69a4e04645c13c8ec0c5f4503da7c729 *src/convert_array_stream.c 8b1d9bcd1fc36b48de3c0150001f778f *src/infer_ptype.c 673cb46b352c5dba6bed900eb01dcc9a *src/init.c 975ea7a13fbeb434b103343baa688d31 *src/materialize.c 454b45e293e1e7bc66a6d81e1b14b54d *src/materialize.h 86c1c1dcafa3f3241695636cbbe1ed94 *src/materialize_blob.h 7d3bbe82a5724a4a0195dc2eab8496dc *src/materialize_chr.h 48102a1913c7c22c508e120c3b22f4a9 *src/materialize_common.h e6dd97454fad97df3de78b64a6106f51 *src/materialize_date.h 4f8db462dc5bcf25ed7a42f2b9b82d3a *src/materialize_dbl.h 69d6d6d23f7794b6b167e6adc24bd866 *src/materialize_difftime.h 76f146bead6bd20aa96a7c94851db7fe *src/materialize_int.h 2937593e010e28b92096616ab8e114d1 *src/materialize_int64.h 2474e0e7f6c4002924b05a44dc4034f3 *src/materialize_lgl.h 9d547b11f10e00f9aff1f6943ffbef91 *src/materialize_posixct.h 04c963c561bab504b00e89efc623a965 *src/materialize_unspecified.h 61ceb0e8dd4ffc53163add70c0e11117 *src/nanoarrow.c da6a57b02892326e916482f65628701b *src/nanoarrow.h 
efc511533436f9f64fbb12b253bcba25 *src/nanoarrow_cpp.cc 9616650c204bf2f4421e91d66e2793e2 *src/pointers.c d99d1f5474e7f8b1689a0133ba1c4f03 *src/schema.c b6af4a24ba6939fa08e135699e7c0ab4 *src/schema.h b737e29ea62fbb615af995372204d115 *src/util.c 4be8009693d6bdaa7056ab3779215203 *src/util.h bd5ef54bce08c2a12d42a6c6b58f9ffa *src/version.c a78f805c4c8406d9012eb5ad2aada4e4 *tests/testthat.R 7b8b3ccde03c6dc39b3ede76c56f3532 *tests/testthat/_snaps/array-stream.md 669744f5bd7def919393275a0fa8fa14 *tests/testthat/_snaps/as-array.md 79ad070e5cd671a27d1620c7f4dde40a *tests/testthat/_snaps/buffer.md cdd56d9d91a8dfe2c7b9ae346ed7e1d8 *tests/testthat/test-altrep.R 9e34c828fa01d924cbc4b5e40451faf8 *tests/testthat/test-array-stream.R 2ed91bdaf43ab801da79a7e254fc4990 *tests/testthat/test-array.R 73c3c1eda962c1234ecb7f5021275b53 *tests/testthat/test-as-array.R 7401646835220a087f5fa95c788b0465 *tests/testthat/test-buffer.R b00603d1d72417d7e17b34ad6ee05455 *tests/testthat/test-convert-array-stream.R ea2397d8ca0f5aaadc8d7fc997393498 *tests/testthat/test-convert-array.R 5fa782ab40feda14c373e559a5566f41 *tests/testthat/test-extension-vctrs.R bef46721d102b541fa766b8d494d8cd0 *tests/testthat/test-extension.R 2c0761d46c9de72bfcef7379a0ebc8a7 *tests/testthat/test-infer-ptype.R 5d03084abff7f7f929713b591aa216ee *tests/testthat/test-nanoarrow-package.R f574a3c7a7566e2fe4c62528e12b72fe *tests/testthat/test-pkg-arrow.R bef36f06e21dfd2efb40c2432d46db70 *tests/testthat/test-pointers.R 1fd2cabedd6532aca8ee2ed59fbe9e74 *tests/testthat/test-schema.R a02e8d00d0cd68982b4382df40fcd72a *tests/testthat/test-type.R 17a59ff8c36b700a0f9edcfba0d3aa09 *tests/testthat/test-util.R 3d6615b7150f3ce50b8ec9ae3bc957bf *tools/make-callentries.R nanoarrow/inst/0000755000176200001440000000000014547575511013253 5ustar liggesusersnanoarrow/inst/include/0000755000176200001440000000000014547575511014676 5ustar liggesusersnanoarrow/inst/include/nanoarrow/0000755000176200001440000000000014547575511016704 5ustar 
liggesusersnanoarrow/inst/include/nanoarrow/r.h0000644000176200001440000003703014547575511017321 0ustar liggesusers// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #ifndef NANOARROW_R_H_INCLUDED #define NANOARROW_R_H_INCLUDED #include #include #include #ifdef __cplusplus extern "C" { #endif /// \defgroup nanoarrow-r Utilities for Arrow R extensions /// /// EXPERIMENTAL: The interface and lifecycle semantics described in this header /// should be considered experimental and may change in a future version based on /// user feedback. /// /// In the nanoarrow R package, an external pointer to an ArrowSchema, ArrowArray, or /// ArrowArrayStream carries the class "nanoarrow_schema", "nanoarrow_array", or /// "nanoarrow_array_stream" (respectively). The pointer must point to valid memory /// or be NULL until the R external pointer object is finalized. /// /// nanoarrow_schema_owning_xptr(), nanoarrow_array_owning_xptr(), and /// nanoarrow_array_stream_owning_xptr() initialize such an external pointer using /// malloc() and a NULL initial release() callback such that it can be distinguished from /// a pointer to an initialized value according to the Arrow C Data/Stream interface /// documentation. 
This structure is intended to have a valid value initialized into it /// using ArrowXXXMove() or by passing the pointer to a suitable exporting function. /// /// External pointers allocated by nanoarrow_xxxx_owning_xptr() register a finalizer /// that will call the release() callback when its value is non-NULL and points to /// a structure whose release() callback is also non-NULL. External pointers may also /// manage lifecycle by declaring a strong reference to a single R object via /// R_SetExternalPtrProtected(); however, when passing the address of an R external /// pointer to a non-R library, the ownership of the structure must *not* have such SEXP /// dependencies. The nanoarrow R package can wrap such an SEXP dependency into a /// self-contained thread-safe release callback via nanoarrow_pointer_export() that /// manages the SEXP dependency using a preserve/release mechanism similar to /// R_PreserveObject()/ R_ReleaseObject(). /// /// The "tag" of an external pointer to an ArrowArray must be R_NilValue or an external /// pointer to an ArrowSchema that may be used to interpret the pointed-to ArrowArray. The /// "tag" of a nanoarrow external pointer to an ArrowSchema or ArrowArrayStream is /// reserved for future use and must be R_NilValue. 
/// /// @{ // Extra guard for versions of Arrow without the canonical guard #ifndef ARROW_FLAG_DICTIONARY_ORDERED #ifndef ARROW_C_DATA_INTERFACE #define ARROW_C_DATA_INTERFACE #include #define ARROW_FLAG_DICTIONARY_ORDERED 1 #define ARROW_FLAG_NULLABLE 2 #define ARROW_FLAG_MAP_KEYS_SORTED 4 struct ArrowSchema { // Array type description const char* format; const char* name; const char* metadata; int64_t flags; int64_t n_children; struct ArrowSchema** children; struct ArrowSchema* dictionary; // Release callback void (*release)(struct ArrowSchema*); // Opaque producer-specific data void* private_data; }; struct ArrowArray { // Array data description int64_t length; int64_t null_count; int64_t offset; int64_t n_buffers; int64_t n_children; const void** buffers; struct ArrowArray** children; struct ArrowArray* dictionary; // Release callback void (*release)(struct ArrowArray*); // Opaque producer-specific data void* private_data; }; #endif // ARROW_C_DATA_INTERFACE #ifndef ARROW_C_STREAM_INTERFACE #define ARROW_C_STREAM_INTERFACE struct ArrowArrayStream { // Callback to get the stream type // (will be the same for all arrays in the stream). // // Return value: 0 if successful, an `errno`-compatible error code otherwise. // // If successful, the ArrowSchema must be released independently from the stream. int (*get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out); // Callback to get the next array // (if no error and the array is released, the stream has ended) // // Return value: 0 if successful, an `errno`-compatible error code otherwise. // // If successful, the ArrowArray must be released independently from the stream. int (*get_next)(struct ArrowArrayStream*, struct ArrowArray* out); // Callback to get optional detailed error information. // This must only be called if the last stream operation failed // with a non-0 return code. 
// // Return value: pointer to a null-terminated character array describing // the last error, or NULL if no description is available. // // The returned pointer is only valid until the next operation on this stream // (including release). const char* (*get_last_error)(struct ArrowArrayStream*); // Release callback: release the stream's own resources. // Note that arrays returned by `get_next` must be individually released. void (*release)(struct ArrowArrayStream*); // Opaque producer-specific data void* private_data; }; #endif // ARROW_C_STREAM_INTERFACE #endif // ARROW_FLAG_DICTIONARY_ORDERED /// \brief Allocate an external pointer to an ArrowSchema /// /// Allocate an external pointer to an uninitialized ArrowSchema with a finalizer that /// ensures that any non-null release callback in a pointed-to structure will be called /// when the external pointer is garbage collected. static inline SEXP nanoarrow_schema_owning_xptr(void); /// \brief Allocate an external pointer to an ArrowArray /// /// Allocate an external pointer to an uninitialized ArrowArray with a finalizer that /// ensures that any non-null release callback in a pointed-to structure will be called /// when the external pointer is garbage collected. static inline SEXP nanoarrow_array_owning_xptr(void); /// \brief Allocate an external pointer to an ArrowArrayStream /// /// Allocate an external pointer to an uninitialized ArrowArrayStream with a finalizer /// that ensures that any non-null release callback in a pointed-to structure will be /// called when the external pointer is garbage collected. static inline SEXP nanoarow_array_stream_owning_xptr(void); /// \brief Ensure an input SEXP points to an initialized ArrowSchema /// /// This function will always return an ArrowSchema pointer that can be safely /// consumed or raise an error via Rf_error(). This is intended to be used to /// sanitize an *input* ArrowSchema. 
static inline struct ArrowSchema* nanoarrow_schema_from_xptr(SEXP schema_xptr); /// \brief Ensure an output SEXP points to an uninitialized ArrowSchema /// /// This function will always return an ArrowSchema pointer that can be safely /// used as an output argument or raise an error via Rf_error(). This is intended /// to be used to sanitize an *output* ArrowSchema allocated from R or elsewhere. static inline struct ArrowSchema* nanoarrow_output_schema_from_xptr(SEXP schema_xptr); /// \brief Ensure an input SEXP points to an initialized ArrowArray /// /// This function will always return an ArrowArray pointer that can be safely /// consumed or raise an error via Rf_error(). This is intended to be used to /// sanitize an *input* ArrowArray. static inline struct ArrowArray* nanoarrow_array_from_xptr(SEXP array_xptr); /// \brief Ensure an output SEXP points to an uninitialized ArrowArray /// /// This function will always return an ArrowArray pointer that can be safely /// used as an output argument or raise an error via Rf_error(). This is intended /// to be used to sanitize an *output* ArrowArray allocated from R or elsewhere. static inline struct ArrowArray* nanoarrow_output_array_from_xptr(SEXP array_xptr); /// \brief Ensure an input SEXP points to an initialized ArrowArrayStream /// /// This function will always return an ArrowArrayStream pointer that can be safely /// consumed or raise an error via Rf_error(). This is intended to be used to /// sanitize an *input* ArrowArrayStream. static inline struct ArrowArrayStream* nanoarrow_array_stream_from_xptr( SEXP array_stream_xptr); /// \brief Ensure an output SEXP points to an uninitialized ArrowArrayStream /// /// This function will always return an ArrowArrayStream pointer that can be safely /// used as an output argument or raise an error via Rf_error(). This is intended /// to be used to sanitize an *output* ArrowArrayStream allocated from R or elsewhere. 
static inline struct ArrowArrayStream* nanoarrow_output_array_stream_from_xptr( SEXP array_stream_xptr); /// \brief Finalize an external pointer to an ArrowSchema /// /// This function is provided for internal use by nanoarrow_schema_owning_xptr() /// and should not be called directly. static void nanoarrow_finalize_schema_xptr(SEXP schema_xptr); /// \brief Finalize an external pointer to an ArrowArray /// /// This function is provided for internal use by nanoarrow_array_owning_xptr() /// and should not be called directly. static void nanoarrow_finalize_array_xptr(SEXP array_xptr); /// \brief Finalize an external pointer to an ArrowArrayStream /// /// This function is provided for internal use by nanoarrow_array_stream_owning_xptr() /// and should not be called directly. static void nanoarrow_finalize_array_stream_xptr(SEXP array_stream_xptr); /// @} // Implementations follow static void nanoarrow_finalize_schema_xptr(SEXP schema_xptr) { struct ArrowSchema* schema = (struct ArrowSchema*)R_ExternalPtrAddr(schema_xptr); if (schema != NULL && schema->release != NULL) { schema->release(schema); } if (schema != NULL) { free(schema); R_ClearExternalPtr(schema_xptr); } } static void nanoarrow_finalize_array_xptr(SEXP array_xptr) { struct ArrowArray* array = (struct ArrowArray*)R_ExternalPtrAddr(array_xptr); if (array != NULL && array->release != NULL) { array->release(array); } if (array != NULL) { free(array); R_ClearExternalPtr(array_xptr); } } static void nanoarrow_finalize_array_stream_xptr(SEXP array_stream_xptr) { struct ArrowArrayStream* array_stream = (struct ArrowArrayStream*)R_ExternalPtrAddr(array_stream_xptr); if (array_stream != NULL && array_stream->release != NULL) { array_stream->release(array_stream); } if (array_stream != NULL) { free(array_stream); R_ClearExternalPtr(array_stream_xptr); } } static inline SEXP nanoarrow_schema_owning_xptr(void) { struct ArrowSchema* schema = (struct ArrowSchema*)malloc(sizeof(struct ArrowSchema)); if (schema == NULL) { 
Rf_error("Failed to allocate ArrowSchema"); } schema->release = NULL; SEXP schema_xptr = PROTECT(R_MakeExternalPtr(schema, R_NilValue, R_NilValue)); SEXP schema_cls = PROTECT(Rf_mkString("nanoarrow_schema")); Rf_setAttrib(schema_xptr, R_ClassSymbol, schema_cls); R_RegisterCFinalizer(schema_xptr, &nanoarrow_finalize_schema_xptr); UNPROTECT(2); return schema_xptr; } static inline SEXP nanoarrow_array_owning_xptr(void) { struct ArrowArray* array = (struct ArrowArray*)malloc(sizeof(struct ArrowArray)); array->release = NULL; SEXP array_xptr = PROTECT(R_MakeExternalPtr(array, R_NilValue, R_NilValue)); SEXP array_cls = PROTECT(Rf_mkString("nanoarrow_array")); Rf_setAttrib(array_xptr, R_ClassSymbol, array_cls); R_RegisterCFinalizer(array_xptr, &nanoarrow_finalize_array_xptr); UNPROTECT(2); return array_xptr; } static inline SEXP nanoarow_array_stream_owning_xptr(void) { struct ArrowArrayStream* array_stream = (struct ArrowArrayStream*)malloc(sizeof(struct ArrowArrayStream)); array_stream->release = NULL; SEXP array_stream_xptr = PROTECT(R_MakeExternalPtr(array_stream, R_NilValue, R_NilValue)); SEXP array_stream_cls = PROTECT(Rf_mkString("nanoarrow_array_stream")); Rf_setAttrib(array_stream_xptr, R_ClassSymbol, array_stream_cls); R_RegisterCFinalizer(array_stream_xptr, &nanoarrow_finalize_array_stream_xptr); UNPROTECT(2); return array_stream_xptr; } static inline struct ArrowSchema* nanoarrow_schema_from_xptr(SEXP schema_xptr) { if (!Rf_inherits(schema_xptr, "nanoarrow_schema")) { Rf_error("`schema` argument that does not inherit from 'nanoarrow_schema'"); } struct ArrowSchema* schema = (struct ArrowSchema*)R_ExternalPtrAddr(schema_xptr); if (schema == NULL) { Rf_error("nanoarrow_schema() is an external pointer to NULL"); } if (schema->release == NULL) { Rf_error("nanoarrow_schema() has already been released"); } return schema; } static inline struct ArrowSchema* nanoarrow_output_schema_from_xptr(SEXP schema_xptr) { if (!Rf_inherits(schema_xptr, "nanoarrow_schema")) { 
Rf_error("`schema` argument that does not inherit from 'nanoarrow_schema'"); } struct ArrowSchema* schema = (struct ArrowSchema*)R_ExternalPtrAddr(schema_xptr); if (schema == NULL) { Rf_error("nanoarrow_schema() is an external pointer to NULL"); } if (schema->release != NULL) { Rf_error("nanoarrow_schema() output has already been initialized"); } return schema; } static inline struct ArrowArray* nanoarrow_array_from_xptr(SEXP array_xptr) { if (!Rf_inherits(array_xptr, "nanoarrow_array")) { Rf_error("`array` argument that is not a nanoarrow_array()"); } struct ArrowArray* array = (struct ArrowArray*)R_ExternalPtrAddr(array_xptr); if (array == NULL) { Rf_error("nanoarrow_array() is an external pointer to NULL"); } if (array->release == NULL) { Rf_error("nanoarrow_array() has already been released"); } return array; } static inline struct ArrowArray* nanoarrow_output_array_from_xptr(SEXP array_xptr) { if (!Rf_inherits(array_xptr, "nanoarrow_array")) { Rf_error("`array` argument that is not a nanoarrow_array()"); } struct ArrowArray* array = (struct ArrowArray*)R_ExternalPtrAddr(array_xptr); if (array == NULL) { Rf_error("nanoarrow_array() is an external pointer to NULL"); } if (array->release != NULL) { Rf_error("nanoarrow_array() output has already been initialized"); } return array; } static inline struct ArrowArrayStream* nanoarrow_array_stream_from_xptr( SEXP array_stream_xptr) { if (!Rf_inherits(array_stream_xptr, "nanoarrow_array_stream")) { Rf_error("`array_stream` argument that is not a nanoarrow_array_stream()"); } struct ArrowArrayStream* array_stream = (struct ArrowArrayStream*)R_ExternalPtrAddr(array_stream_xptr); if (array_stream == NULL) { Rf_error("nanoarrow_array_stream() is an external pointer to NULL"); } if (array_stream->release == NULL) { Rf_error("nanoarrow_array_stream() has already been released"); } return array_stream; } static inline struct ArrowArrayStream* nanoarrow_output_array_stream_from_xptr( SEXP array_stream_xptr) { if 
(!Rf_inherits(array_stream_xptr, "nanoarrow_array_stream")) { Rf_error("`array_stream` argument that is not a nanoarrow_array_stream()"); } struct ArrowArrayStream* array_stream = (struct ArrowArrayStream*)R_ExternalPtrAddr(array_stream_xptr); if (array_stream == NULL) { Rf_error("nanoarrow_array_stream() is an external pointer to NULL"); } if (array_stream->release != NULL) { Rf_error("nanoarrow_array_stream() output has already been initialized"); } return array_stream; } #ifdef __cplusplus } #endif #endif nanoarrow/configure0000755000176200001440000000373114556775567014226 0ustar liggesusers# Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. # If we are building from within the nanoarrow repo, bootstrap.R will (1) # exist and (2) perform the necessary vendoring steps if [ -f bootstrap.R ]; then $R_HOME/bin/Rscript bootstrap.R fi if [ -f "src/nanoarrow.h" ] && [ -f "src/nanoarrow.c" ]; then echo "Found vendored nanoarrow" exit 0 fi # We have a situation where the package has been built via R CMD build # but there is no vendored nanoarrow. This occurs with # remotes::install_github() with the default arguments. In this case, pull # the latest bundled version from GitHub. 
To ensure commit-level consistency,
# use remotes::install_github(build = FALSE) (which will run cmake to get
# a fresh bundle with a specific commit).

# Download one bundled source file from the upstream dist/ directory into src/.
#   $1: file name (the same under dist/ and src/)
# The file is fetched under a temporary name and only moved into place when
# curl reports success. --fail makes curl return non-zero on an HTTP error
# instead of saving the error page, so neither an error page nor a partial
# transfer can later be mistaken for a vendored source file.
fetch_bundled() {
  if curl -L "https://github.com/apache/arrow-nanoarrow/raw/main/dist/$1" \
    --output "src/$1.download" --silent --fail; then
    mv "src/$1.download" "src/$1"
  else
    rm -f "src/$1.download"
    return 1
  fi
}

fetch_bundled nanoarrow.h
fetch_bundled nanoarrow.c

if [ -f "src/nanoarrow.h" ] && [ -f "src/nanoarrow.c" ]; then
  echo "Fetched bundled nanoarrow from https://github.com/apache/arrow-nanoarrow/tree/main/dist"
  exit 0
fi

echo "Vendored src/nanoarrow.h and/or src/nanoarrow.c are missing"
echo "This source tarball was built incorrectly."
exit 1