RandomFieldsUtils/0000755000176200001440000000000014227516720013650 5ustar liggesusersRandomFieldsUtils/NAMESPACE0000644000176200001440000000240014227157055015065 0ustar liggesusers ############exportPattern("^[^\\.]") export(cholx, cholPosDef, Print, solvex, solvePosDef, chol2mv, tcholRHS, crossprodx, scalarx, colMax, rowMeansx, rowProd, SelfDivByRow, quadratic, dotXV, sleep.milli, sleep.micro, hostname, pid, # gpu_info, FileExists, LockRemove, WaitOthers, sortx, orderx, gauss, matern, nonstwm, whittle, I0L0, struveH, struveL, # besselKx, RFoptions, confirm, checkExamples, Dependencies, debugging_level, dbinorm, uses.simd.instruction, misses.simd.instruction, LA_AUTO, LA_INTERN, LA_R, LA_GPU, LA_QUERY, PIVOT_NONE, PIVOT_AUTO, PIVOT_DO, PIVOT_IDX ) useDynLib(RandomFieldsUtils, .registration = TRUE, .fixes = "C_") #useDynLib(spam) importFrom("utils", "str", "packageDescription", "contrib.url", "read.table", "install.packages", "available.packages", "compareVersion", "installed.packages") importFrom("methods", "hasArg", "is") importFrom("grDevices", "dev.off") importFrom("parallel", "detectCores") S3method(print, RFopt) S3method(summary, RFopt) S3method(print, summary.RFopt) S3method(print, RFoptElmnt) S3method(summary, RFoptElmnt) S3method(print, summary.RFoptElmnt) S3method(print, gpu_list) RandomFieldsUtils/man/0000755000176200001440000000000014227157055014425 5ustar liggesusersRandomFieldsUtils/man/RFoptions.Rd0000644000176200001440000006116414227157055016647 0ustar liggesusers\name{RFoptions} \alias{RFoptions} \alias{PIVOT_NONE} \alias{PIVOT_AUTO} \alias{PIVOT_DO} \alias{PIVOT_IDX} \alias{LA_AUTO} \alias{LA_INTERN} \alias{LA_R} \alias{LA_GPU} \alias{LA_QUERY} %\alias{PIVOT_IDXBACK} \alias{PIVOTSPARSE_MMD} \alias{PIVOTSPARSE_RCM} \title{Setting control arguments} \description{ \command{\link{RFoptions}} sets and returns control arguments for the analysis and the simulation of random fields } \usage{ RFoptions(..., no.class=FALSE, install.control=NULL) } \arguments{ \item{...}{arguments in \code{tag = value} form, or a list of tagged values. See \sQuote{Details} for options in package \pkg{RandomFieldsUtils}. } \item{no.class}{logical. If \code{TRUE} the list is returned without class specification. } \item{install.control}{list. See Details, Part 2}. } \details{ The subsections below comment on\cr \bold{1. \code{basic}: Basic options}\cr \bold{2. \code{install.control}}\cr \bold{3. \code{installNrun}: Options for installation and running}\cr \bold{4. \code{solve}: Options for solving linear systems}\cr \bold{5. Reserved words}\cr \cr %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % \bold{16. Options for RFloglikelihood}\cr % % "auto", "full", "composite", "selection" %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \bold{1. Basic options} \describe{ \item{\code{asList}}{logical. Lists of arguments are treated slightly different from non-lists. If \code{asList=FALSE} they are treated the same way as non-lists. This options being set to \code{FALSE} after calling \command{RFoptions} it should be set as first element of a list. Default: \code{TRUE} } \item{\code{cores}}{ Number of cores for multicore algorithms; currently only used for the Cholesky decomposition. Default : \code{1} if the package has been compiled with standard flags of CRAN and \code{0.75 * cores() + 0.25 * cpus()} else. Note that \code{cores} has not effect if set locally in this package or in package miraculix. 
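% A minimal sketch (not run) of querying and overriding the 'cores'
% option described above; the value 2 is only an arbitrary illustration:
% RFoptions()$basic$cores     ## currently used number of cores
% RFoptions(cores = 2)        ## e.g. use 2 cores for the Cholesky decomposition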
} \item{\code{cPrintlevel}}{ \code{cPrintlevel} is automatically set to \code{printlevel} when \code{printlevel} is changed. Standard users will never use a value higher than 3. 0 : no messages\cr 1 : messages and warnings when the user's input looks odd\cr 2 : messages (and internal errors) documenting the choice of the simulation method\cr 3 : further user relevant informations\cr 4 : information on recursive function calls\cr 5 : function flow information of central functions \cr 6 : errors that are internally treated\cr 7 : details on building up the covariance structure\cr 8 : details on taking the square root of the covariance matrix\cr 9 : details on intermediate calculations\cr 10 : further details on intermediate calculations\cr Note that \code{printlevel} works on the R level whereas \code{cPrintlevel} works on the C level. \code{cPrintlevel} should be changed only globally. Default: 1 \cr % [also do].\cr } \item{efficient}{ logical. If \code{TRUE} then always the most time efficient code is used. % The value \code{FALSE} for debugging Default: \code{TRUE}. It is strongly recommended to retain this value. } \item{\code{helpinfo}}{logical. If \code{TRUE} then additional information is printed for more efficient programming in R. Default: \code{TRUE} } \item{\code{printlevel}}{If \code{printlevel}\eqn{\le0}{<=0} there is not any output on the screen. The higher the number the more tracing information is given. Standard users will never use a value higher than 3. 0 : no messages\cr 1 : important (error) messages and warnings\cr 2 : less important messages\cr 3 : details, but still for the user\cr 4 : recursive call tracing\cr 5 : function flow information of large functions\cr 6 : errors that are internally treated\cr 7 : details on intermediate calculations\cr 8 : further details on intermediate calculations\cr Default: 1 %[also do].\cr } \item{\code{seed}}{integer (currently only used by the package RandomFields). If \code{NULL} or \code{NA} % \command{\link[base]{set.seed}} \command{set.seed} is \bold{not} called. Otherwise, % \code{\link[base]{set.seed}(seed)} \code{set.seed(seed)} is set before any simulations are performed. If the argument is set locally, i.e., within a function, it has the usual local effect. If it is set globally, i.e. by \command{RFoptions} the \code{seed} is fixed for \bold{all subsequent} calls. If the number of simulations \code{n} is greater than one and if \code{RFoptions(seed=seed)} is set, the \eqn{i}th simulation is started with the seed \sQuote{\code{seed}\eqn{+i-1}}. % The function \code{set.seed} should not be used in case \code{n} % is greater than 1. % %Vgle! %set.seed(5) %RFsimulate(RPschlather(RMmatern(nu=2), xi=1, mu=1, s=1), x, grid=F, n=5)@data %set.seed(5) %RFsimulate(RPschlather(RMmatern(nu=2.01), xi=1, mu=1, s=1), x,grid=F,n=5)@data %RFoptions(cPr=3, seed=5) %RFsimulate(RPschlather(RMmatern(nu=2), xi=1, mu=1, s=1), x, grid=F, n=5)@data %RFsimulate(RPschlather(RMmatern(nu=2.01), xi=1, mu=1, s=1), x, grid=F,n=5)@data } \item{\code{skipchecks}}{logical. If \code{TRUE}, several checks whether the given parameter values and the dimension are within the allowed range is skipped. Do not change the value of this variable except you really know what you do. Default: \code{FALSE} %[also do]. } \item{\code{verbose}}{logical. If \code{FALSE} it identical to \code{printlevel = 1} else to \code{printlevel = 2}. } \item{\code{bigendian}}{logical. Read only.} } %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \bold{2. 
\code{install.control}: Details on argument \code{install.control}} \code{install.control} may contain any argument of \command{\link[utils]{install.packages}} except \code{type}. \bold{This options is currently tailored for MS and Linux on Intel machines, only.} The argument \code{configure.args} % (i) is merged with the presetting % of \command{RFoptions}; (ii) may not contain \code{'CXX_FLAGS'} which should be passed as an extra argument with the list. Note that if this argument is given (even with value \code{NULL}), an immediate installation takes place. In case the user tries to force to install 0 packages, an overview over the packages is given. If the user is asked whether re-installation shall take place, user can pass arguments to install.packages, e.g., \code{"quiet=FALSE"}. If \code{install.control} is given, no further argument may be passed to \command{RFoptions}. Additional components of \code{install.control} and special behaviours are: \describe{ \item{\code{path}}{the path to the locally saved tar balls } \item{\code{verbose}, \code{quiet}}{ They affect also the behaviour of \code{RFoptions}. % Within \code{RFoptions}, \code{verbose=TRUE} by default. } \item{\code{force}}{ \describe{ \item{\code{TRUE}}{ reinstallation of all attached libraries based on and including \pkg{RandomFieldsUtils}. I.e., \code{RFoptions(install.control=list(force=TRUE))} is the strongest form of forcing reinstallation. } \item{\code{FALSE}}{In case some packages have to be re-installed the user will be asked. } \item{not given}{ reinstallation of the attached libraries based on and including \pkg{RandomFieldsUtils} that have not been tried yet in the current session. } } } \item{\code{pkgs=NULL}}{ brief overview over the installed packages based on \code{RandomFieldsUtils} } \item{\code{CROSS}}{logical or character. \code{CROSS} is passed to \file{configure.ac}. \describe{ \item{\code{"noflag"}}{ No extra compiler flag is set with respect to SIMD. This is the default. } \item{\code{TRUE}}{each file is compiled with its specific SIMD/AVX compiler flags; this guarantees the compatiblity on a plattform with different sets of kernels. No SIND/AVX flag should be given by the user. Cross-compilation supported; no check is performed whether the code would run on the compiling CPU. } \item{\code{"nosimd"}}{It is assumed that no SIMD is available and the flag "-no-sse2" is set (if possible).} \item{\code{"sse2"}}{Same behaviour as \code{TRUE}, but all CPUs have at least \code{"sse2"} available.} \item{\code{"sse3"}, \code{"ssse3"}, \code{"sse41"}, \code{"avx"}, \code{"avx2"}%, \code{"avx512f"} }{ Alternatives to \code{"sse2"}. Giving the highest guaranteed SIMD recognition leads to the most efficient code. } \item{\code{FALSE}}{each file is compiled with all SIMD/AVX flags recognized by both the CPU and the compiler (no cross-compilation); users may add their own SIMD/AVX flags. This might lead to faster code, but which is not downwards compatible. } \item{\code{NA}}{Same as \code{FALSE} except that the flag \code{-mno-sse2} is set when no SIMD is needed or used. } } This option can be set to \code{"avx"} interactively if \code{install="ask"}. } \item{\code{CC_FLAGS}}{character. Flags passed to \file{configure.ac}. } \item{\code{SIMD_FLAGS}}{character. A subset of \code{"sse2"}, \code{"sse3"}, \code{"ssse3"}, \code{"sse41"}, \code{"avx"}, \code{"avx2"}, \code{"arch=xxx"}, etc. which will be tried instead of default flags. \code{SIMD_FLAGS} is passed to \file{configure.ac}. 
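% A hedged sketch (not run) of a typical 'install.control' call combining
% arguments of this section; choosing CROSS="avx2" is only an example taken
% from the list above, not a recommendation:
% RFoptions(install.control = list(force = TRUE, CROSS = "avx2"))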
} \item{\code{LOCAL_ONLY}}{logical. If \code{TRUE}, the web is not searched for the latest version of the package.} \item{\code{MEM_IS_ALIGNED}}{logical. If \code{TRUE}, then the memory is assumed to be aligned. If \code{FALSE} then the SIMD load commands \code{_mm_*load_*} are replaced by \code{_mm_*loadu_*}. If given, then \code{force} is set to \code{TRUE}. } \item{\code{USE_GPU}}{logical. Force or hinder the compilation for the GPU } }% describe in install.control %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \bold{3. \code{installNrun}: Options for installing and for determining basic behaviour} \describe{ % to do \item{\code{install}}{character. Only used by linux systems and alike including macOS The default by CRAN is that SIMD/AVX cannot be used at full extend. \code{install} determines what the action if the compiled version does not use the full CPU capacities. Since the use of GPU is heavily hardware dependent, its auto-recompilation is only performed in tow line of an AVX re-compilation. The users usually use \describe{ \item{\code{"no"}}{no re-installation % No further re-installation in this session possible % s : CROSS=\"avx\ his option guarantees downwards compatibility to avx. See ?RFoptions for details. } \item{\code{"ask"}}{asks whether the library should be reinstalled, using the full capacity of the CPU according to the package.} \item{\code{"install"}}{performs the auto-recompilation without asking. Note that only the command \code{RFoptions(install.control=list(force=TRUE))} forces re-compilation of the currently loaded packages that are based on \pkg{RandomFieldsUtils}. } } Note that, in each session, a package can be reinstalled only. This feature avoids trying to run jobs that cannot be done (e.g.\ due to missing programs of the OS). See argument \code{install.control} for overwriting this behaviour. Default: at starting point it is \code{"ask"} or \code{"no"}, but the value may change during the session. % The user can also indicate what whould be installed: % \describe{ % \item{\code{"sse"}}{installs SSE} % \item{\dots} % \item{\code{"avx2"}}{installs AVX2} % } } \item{\code{installPackages}}{logical. Read only. Indicates whether packages are left to be re-installed. \code{RFoptions(install="no")} sets it to \code{FALSE}. \code{RFoptions(install="no", install="ask")} sets it to \code{TRUE}. } \item{\code{kahanCorrection}}{obsolete. logical. If \code{TRUE}, the Kahan summation algorithm is used for calculating scalar products. Default: false } \item{\code{la_mode}}{determines \describe{ \item{LA_AUTO, \code{"auto"}}{ If a graphic card is recognized, \code{LA_GPU} is used. In all other cases the default is primarily \code{LA_R}. Only on linux systems, the package peforms a simple speed test and takes \code{LA_INTERN} if it is faster than {LA_R}; the time, hence the choice, depends also on the number of cores used.} \item{LA_INTERN, \code{"intern"}}{mostly own algorithms, often based on SIMD/AVX. This option is of interest only if no advanced BLAS/LAPACK has been compiled into R} \item{LA_R, \code{"R"}}{BLAS/LAPACK implementation used by R} \item{LA_GPU, \code{"GPU"}}{ This option is available when the package has been compiled with nvcc. } \item{LA_QUERY, \code{"query"}}{Request on currently used set-up} } Default: \code{LA_AUTO} } \code{mem_is_aligned}{logical. Read only. See \code{MEM_IS_ALIGNED} in \code{install.control}. } \item{\code{warn_parallel}}{Logical. 
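% A small sketch (not run) of switching the 'la_mode' option described above;
% LA_QUERY merely requests the currently used set-up:
% RFoptions(la_mode = LA_QUERY)    ## query the current linear algebra set-up
% RFoptions(la_mode = LA_INTERN)   ## use the internal routines
% RFoptions(la_mode = LA_AUTO)     ## let the package choose again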
\pkg{RandomFieldsUtils} and packages using it, such as
\pkg{RandomFields} and \pkg{miraculix}, should now be prepared for
parallelization using package \code{parallel}, for instance.
Internal OMP parallelization of \pkg{RandomFieldsUtils} is done, but
only at a few points of the subsequent packages.
As a few parts cannot be run in parallel, technically or from a
logical point of view, a hint or a warning is given if such a point is
not accessed adequately.
These messages can be turned off by \code{warn_parallel = FALSE}.

Default: \code{TRUE}.
}

\item{\code{warn_unknown_option}}{integer.
\describe{
\item{\code{0},\code{1},\code{-1}}{
Unknown options are all ignored. If the value is positive, a hint is
delivered whenever an unknown option is ignored.
}
\item{\code{-2},\code{2}}{
Unknown options that start with a capital letter are ignored. All
others lead to an error. (Note that all \code{RFoptions} start with a
lowercase letter.) If the value is positive, a hint is delivered
whenever an unknown option is ignored.
}
\item{\code{3},\code{-3}}{
Unknown options that consist of a single capital letter are ignored.
All others lead to an error. (Note that all \code{RFoptions} start
with a lowercase letter.) If the value is positive, a hint is
delivered whenever an unknown option is ignored.
}
\item{\code{4}}{(and other values) Any unknown option leads to an error.}
}

Default for \pkg{RandomFieldsUtils}: \code{3}

Default for \pkg{RandomFields}: \code{1}
}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\bold{4. \code{solve}: Options for solving linear systems}

\describe{ % to do
\item{\code{det_as_log}}{
}

\item{\code{eigen2zero}}{
When the svd or eigen decomposition is calculated, all values with
modulus less than or equal to \code{eigen2zero} are set to zero.

Default: \code{1e-12}
}

\item{\code{max_chol}}{integer. Maximum number of rows of a matrix in
a Cholesky decomposition.

Default: \eqn{16384}
}

\item{\code{max_svd}}{integer. Maximum number of rows of a matrix in
an svd decomposition.

Default: \eqn{10000}
}

\item{\code{pivot_partialdet}}{logical. If \code{TRUE} then in case of
low-rank matrices the determinant is calculated only in the part with
positive eigenvalues.}

\item{\code{pivot}}{Type of pivoting for the Cholesky
decomposition. Possible values are
\describe{
\item{PIVOT_NONE, \code{"no"}}{No pivoting.}
\item{PIVOT_AUTO, \code{"auto"}}{If the matrix has a size greater than
3x3 and Cholesky fails without pivoting, pivoting is done. For
matrices of size less than 4x4, no pivoting and no checks are
performed. See also \code{PIVOT_DO}.}
\item{PIVOT_DO, \code{"do"}}{Always do pivoting. NOTE: pivoted
Cholesky decomposition yields only very approximately an upper
triangular matrix L, but still L^t L = M holds true.
% The information about the pivoting sequence are attributed to
% the result.
}
\item{PIVOT_IDX, \code{"idx"}}{uses the same pivoting as in the
previous pivoted decomposition. This option becomes relevant only when
simulations with different parameters or different models shall be
performed with the same seed so that also the pivoting must be
coupled.
% The information about the pivoting sequence are attributed to
% the result.
}
% \item{PIVOT_IDXBACK}{ same as \code{PIVOT_IDX}, but
% the sequence of indices of the pivoting is returned via
% \code{RFoptions()$solve$pivot_idx}.}
}

Default: \code{PIVOT_NONE}
}

\item{\code{pivot_actual_size}}{integer.
Genuine dimension of the linear mapping given by a matrix in
\link{cholx}.
This is a very rarely used option when pivoting with \code{pivot=PIVOT_IDX}. } \item{\code{pivot_check}}{logical. Only used in pivoted Cholesky decomposition. If \code{TRUE} and a numerically zero diagonal element is detected, it is checked whether the offdiagonal elements are numerically zero as well. (See also \code{pivot_max_deviation} and \code{pivot_max_reldeviation}.) If \code{NA} then only a warning is given. Default: \code{TRUE} } \item{\code{pivot_idx}}{vector of integer. Sequence of pivoting indices in pivoted Cholesky decomposition. Note that \code{pivot_idx[1]} gives the number of indices that will be used. The vector must have at least the length \code{pivot_idx[1] + 1}. Default: \code{NULL} } \item{\code{pivot_relerror}}{positive number. Tolerance for (numerically) negative eigenvalues and for (numerically) overdetermined systems appearing in the pivoted Cholesky decomposition. Default: \code{1e-11} } \item{\code{pivot_max_deviation}}{positive number. Together with \code{pivot_max_reldeviation} it determines when the rest of the matrix (eigenvalues) in the pivoted Cholesky decomposition are considered as zero. Default: \code{1e-10} } \item{\code{pseudoinverse}}{logical. In case of a singular matrix \eqn{M}, shall the pseudo inverse be returned for \code{solvex(M)}? Default: \code{FALSE} } \item{\code{pivot_max_reldeviation}}{positive number. Together with \code{pivot_max_deviation} it determines when the rest of the matrix (eigenvalues) in the pivoted Cholesky decomposition are considered as zero. Default: \code{1e-10} } \item{\code{solve_method}}{ vector of at most 3 integers that gives the sequence of methods in order to inverse a matrix or to calculate its square root: \code{"cholesky"}, \code{"svd"}, \code{"lu"}, % \code{"qr"}, \code{"eigen"} \code{"sparse"}, \code{"method undefined"}. In the latter case, the algorithm decides which method might suit best. % Note that \code{"qr"} returns only the transposed of the inverse! Note that if \code{use_spam} is not \code{false} the algorithm checks whether a sparse matrix algorithm should be used and which is then tried first. Default: \code{"method undefined"}. } \item{\code{spam_factor}}{ integer. See argument \code{spam_sample_n}. Default: 4294967 } \item{\code{spam_min_n}}{ integer vector of size 2. The minimal size for a matrix to apply a sparse matrix algorithms automatically. The second value is used in case the GPU is activated. Default: \code{c(400, 4000)} } \item{\code{spam_min_p} (\code{spam_min_p})}{ a numbers in \eqn{(0,1)} giving the proportion of zero above which an sparse matrix algorithm is used. The second value is used in case the GPU is activated. Default: \code{0.8} (\code{0.9}) } \item{\code{spam_pivot}}{ integer. Pivoting algorithm for sparse matrices: \describe{ \item{PIVOT_NONE}{No pivoting} \item{PIVOTSPARSE_MMD}{} \item{PIVOTSPARSE_RCM}{} } See package \code{spam} for details. Default: PIVOTSPARSE_MMD } \item{\code{spam_sample_n} (\code{spam_sample_n_GPU})}{ Whether a matrix is sparse or not is tested by a \sQuote{random} sample of size \code{spam_sample_n}; The selection of the sample is iteratively obtained by multiplying the index by \code{spam_factor} modulo the size of the matrix. Default: 500 (10000). } \item{\code{spam_tol}}{ largest absolute value being considered as zero. Default: \code{DBL_EPSILON} } \item{\code{svdtol}}{Internal. 
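% A brief sketch (not run) showing how 'pivot_idx' could be reused, following
% the cholx() examples of this package; 'M' and 'M2' stand for two compatible
% positive semi-definite matrices:
% RFoptions(pivot = PIVOT_DO)
% C <- cholx(M)
% RFoptions(pivot = PIVOT_IDX, pivot_idx = attr(C, "pivot_idx"))
% C2 <- cholx(M2)   ## reuses the pivoting sequence of the first decomposition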
When the svd decomposition is used for calculating the square root of
a matrix, then the absolute componentwise difference between this
matrix and the square of the square root must be less than
\code{svdtol}. No check is performed if \code{svdtol} is not positive.

Default: \code{0}
}

\item{\code{use_spam}}{
Should the package \code{spam} (sparse matrices) be used for matrix
calculations? If \code{TRUE}, \pkg{spam} is always used. If
\code{FALSE}, it is never used. If \code{NA}, its use is determined by
the size and the sparsity of the matrix.

Default: \code{NA}.
}
}

\bold{5. Reserved Words}
\describe{
\item{\code{list_}}{
\code{list_} usually equals the output of \code{RFoptions()}. This
argument is used to reset the options. Some of the options behave
differently if passed through \code{list_}. E.g., a warning counter is
not reset. The argument \code{list_} cannot be combined with any other
arguments.
}

\item{\code{getoptions_}}{string vector of prefixes that indicate
classes of options. In this package they can be \code{"basic"} and
\code{"solve"}. (E.g. package \pkg{RandomFields} has many more classes
of options.) The given classes of options are then returned by
\code{RFoptions()}. Note that the values are the previous values.

\code{getoptions_} must always be the very first argument.
}

\item{\code{saveoptions_}}{string vector of prefixes. Same as for
\code{getoptions_}, except that important classes are always returned
and thus should not be given. Hence \code{saveoptions_} is often a
convenient shortcut for \code{getoptions_}. The class always included
in this package is \code{"basic"}; in package \pkg{RandomFields} these
are the two classes \code{"basic"} and \code{"general"}.

\code{saveoptions_} must always be the very first argument. In
particular, it may not be given at the same time as \code{getoptions_}.
}

\item{\code{local_}}{logical. This option is allowed only when
advanced packages are used, see \pkg{RandomFields}.
}

\item{\code{warnUnknown_}}{integer. Same as option
\code{warn_unknown_option}, except that its value overwrites the value
of \code{warn_unknown_option} in the current command
\code{RFoptions}. This option must be placed between \code{CODE} and
\code{getoptions_}, if the latter are used.
}
}
}

\value{
\code{NULL} if any argument is given, and the full list of arguments,
otherwise.
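% A possible save-and-restore pattern (not run) based on the reserved words
% described above; that the saved sublist can be restored via 'list_' is an
% assumption of this sketch:
% old <- RFoptions(getoptions_ = "solve", pivot = PIVOT_DO)  ## save, then change
% ## ... calculations using pivoted Cholesky ...
% RFoptions(list_ = old)                                     ## restore the former values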
} \me \examples{ % library(RandomFieldsUtils) n <- 10 M <- matrix(1, ncol=n, nrow=n) \dontrun{ try(chol(M)) ## error, as M is not strictly positive definite try(cholx(M)) ## also fails } %cholx(M) ## works RFoptions(la_mode=LA_INTERN, pivot=PIVOT_AUTO) cholx(M) ## works RFoptions(la_mode=LA_R) RFoptions(solve_method="svd", pseudoinverse=TRUE) solvex(M) RFoptions(solve_method="method undefined", pseudoinverse=FALSE) } \keyword{spatial} RandomFieldsUtils/man/rowMeansx.Rd0000644000176200001440000000513714227157055016705 0ustar liggesusers\name{rowMeansx} \alias{rowMeans} \alias{rowMeansx} \alias{colMax} \alias{rowProd} \alias{SelfDivByRow} \alias{quadratic} \alias{dotXV} \alias{crossprodx} \alias{scalarx} \title{Some Further Row and Column Functions} \description{ The function \command{rowMeansx} returns weighted row means;\cr the function \command{colMax} returns column maxima;\cr the function \command{rowProd} returns the product of each row;\cr the function \command{quadratic} calculates a quadratic form\cr the function \command{SelfDivByRow} devides each column by a scalar;\cr the function \command{dotXV} calculates columnwise the dot product;\cr the function \command{crossprodx} calculates the cross product (using AVX);\cr the function \command{scalarx} calculates the scalar product (using AVX);\cr } \usage{ rowMeansx(x, weight=NULL) colMax(x) rowProd(x) SelfDivByRow(x, v) quadratic(x, v) dotXV(x, w) crossprodx(x,y,mode=-1) scalarx(x, y, mode=0) } \arguments{ \item{x}{numerical (or logical) matrix} \item{v}{vector whose length equals the number of columns of \code{x}} \item{w}{vector whose length equals the number of rows of \code{x}} \item{weight}{numerical or logical vector of length \code{nrow(x)}} \item{y}{numerical matrix} \item{mode}{integer between 0 and 8 or negative, indicating that the default value should be used. Determine the algorithm how the scalar product is calculated. These values are experimental and may change their meaning. } } \details{ \code{quadratic(x, v)} calculates the quadratic form \eqn{v^\top x v}; The matrix \code{x} must be squared. } \value{ \command{rowMeansx} returns a vector of length\code{nrow(x)}. \command{colMax} returns a vector of length \code{ncol(x)}. \command{rowProd} returns a vector of length \code{nrow(x)}. \command{quadratic} returns a scalar. \command{SelfDivByRow} returns a matrix of same size as \code{x}. \command{dotXV} returns a matrix of same size as \code{x}. } \me \examples{% library(RandomFieldsUtils) c <- if (interactive()) 10000 else 10 r <- if (interactive()) 20000 else 20 M <- matrix(nr=r, 1:(c * r)) ## unweighted means, compare to rowMeans print(system.time(m1 <- rowMeans(M))) print(system.time(m2 <- rowMeansx(M))) stopifnot(all.equal(m1, m2)) ## weighted row means, compare to rowMeans W <- 1 / (ncol(M) : 1) print(system.time({M0 <- t(W * t(M)); m1 <- rowMeans(M0)})) print(system.time(m2 <- rowMeansx(M, W))) stopifnot(all.equal(m1, m2)) print(system.time(m1 <- apply(M, 2, max))) print(system.time(m2 <- colMax(M))) stopifnot(m1 == m2) } \keyword{utilities} % LocalWords: pid unix Schlather url RandomFieldsUtils/man/cholPosDef.Rd0000644000176200001440000000576714227157055016761 0ustar liggesusers\name{Cholesky} \alias{cholesky} \alias{chol} \alias{cholx} \alias{cholPosDef} \alias{chol2mv} \alias{tcholRHS} \title{Cholesky Decomposition of Positive Definite Matrices} \description{ This function calculates the Cholesky decomposition of a matrix. 
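% A minimal sketch (not run); 'M' is a hypothetical positive definite matrix:
% M <- crossprod(matrix(runif(9), nrow = 3)) + diag(3)
% C <- cholx(M)                ## upper triangular Cholesky factor
% range(crossprod(C) - M)      ## numerically zero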
} \usage{ cholx(a) chol2mv(C, n) tcholRHS(C, RHS) %, sparse=NA, method=-1) } \arguments{ \item{a}{a square real-valued positive definite matrix } \item{C}{a (pivoted) Cholesky decomposition calculated by \command{cholx}} \item{n}{integer. Number of realisations of the multivariate normal distribution} \item{RHS}{vector} % \item{sparse}{logical or \code{NA}. % If \code{NA} the function determines whether a sparse % matrix algorithm of the package \pkg{spam} should be used. % } % \item{method}{integer vector. % If the sparse matrix algorithm is not used, \code{method} % determines the alternative algorithm. See Details. % } } \value{ \command{cholx} returns a matrix containing the Cholesky decomposition (in its upper part). \command{chol2mv} takes the Cholesky decomposition and returns a \code{n} realisations of a multivariate normal distribution with mean 0 and covariance function \code{a} \command{tcholRHS} multiplies the vector \code{RHS} from the right to \emph{lower} triangular matrix of the Cholesky decomposition. See examples below. } \details{ If the matrix is diagonal direct calculations are performed. %Else if the matrix is sparse the package \pkg{spam} is used. Else the Cholesky decomposition is tried. } \references{ Harbrecht, H., Peters, M., Schneider, R. (2012) On the low-rank approximation by the pivoted Cholesky decomposition. \emph{Appl. Num. Math.} \bold{62}, 428--440. } %\seealso{ % \link{chol2mv}, \link{tcholRHS} % \link[spam]{chol.spam} in the package \pkg{spam} %} \me \keyword{math} \examples{ ########################## ## Example showing the use of chol2mv and tcholRHS n <- 10 M <- matrix(nc=n, runif(n^2)) M <- M \%*\% t(M) + diag(n) C <- cholx(M) set.seed(0) v1 <- chol2mv(C, 1) set.seed(0) v2 <- tcholRHS(C, rnorm(n)) stopifnot(all(v1 == v2)) ########################## ## The following example shows pivoted Cholesky can be used ## and the pivotation permutation can be transferred to ## subsequent Cholesky decompositions % library(RandomFieldsUtils) set.seed(0) n <- if (interactive()) 1000 else 100 x <- 1:n y <- runif(n) M <- x \%*\% t(x) + rev(x) \%*\% t(rev(x)) + y \%*\% t(y) ## do pivoting RFoptions(pivot = PIVOT_DO, la_mode=LA_INTERN) print(system.time(C <- cholx(M))) print(range(crossprod(C) - M)) str(C) ## use the same pivoted decomposition as in the previous decomposition M2 <- M + n * diag(1:n) RFoptions(pivot = PIVOT_IDX, la_mode=LA_INTERN, pivot_idx = attr(C, "pivot_idx"), pivot_actual_size = attr(C, "pivot_actual_size")) print(system.time(C2 <- cholx(M2))) print(range(crossprod(C2) - M2)) range((crossprod(C2) - M2) / M2) str(C2) RFoptions(pivot = PIVOT_AUTO, la_mode = LA_AUTO) \dontshow{RFoptions(pivot_idx = integer(0))} } RandomFieldsUtils/man/nonstwm.Rd0000644000176200001440000000302414227157055016420 0ustar liggesusers\name{nonstwm} \alias{nonstwm} \title{nonstwm} \description{ The non-stationary Whittle-Matern model \eqn{C} is given by \deqn{C(x, y)=\Gamma(\mu) \Gamma(\nu(x))^{-1/2} \Gamma(\nu(y))^{-1/2} W_{\mu} (f(\mu) |x-y|)}{C(x, y)=\Gamma(\mu) \Gamma(\nu(x))^{-1/2} \Gamma(\nu(y))^{-1/2} W_{\mu} (f(\mu) |x-y|)} where \eqn{\mu = [\nu(x) + \nu(y)]/2}, and \eqn{\nu} must a positive function. \eqn{W_{\mu}} is the covariance function \command{\link{whittle}}. 
The function \eqn{f} takes the following values \describe{ \item{\code{scaling = "whittle"} :}{\eqn{f(\mu) = 1}} \item{\code{scaling = "matern"} :}{\eqn{f(\mu) = \sqrt{2\nu}}} \item{\code{scaling = "handcockwallis"} :}{\eqn{f(\mu) = 2\sqrt{\nu}}} \item{\code{scaling} = s, numerical :}{\eqn{f(\mu) = s * \sqrt{nu}}} } } \usage{ nonstwm(x, y, nu, log=FALSE, scaling=c("whittle", "matern", "handcockwallis")) } \arguments{ \item{x, y}{numerical vectors of the same length} \item{nu}{positive value or a function with positive values and \code{x} as argument} \item{log}{logical. If \code{TRUE} the logirithm of the covariance function is returned.} \item{scaling}{positive value or character; see Details.} } \value{ A single value is returned. } \references{ \itemize{ \item Stein, M. (2005) Nonstationary Spatial Covariance Functions. Tech. Rep., 2005 } } \me \seealso{ \command{\link{matern}}. % For more details see \command{\link[RandomFields]{RMnonstwm}}. } \examples{ nonstwm(2, 1, sin) } \keyword{spatial} \keyword{models} RandomFieldsUtils/man/matern.Rd0000644000176200001440000000575114227157055016212 0ustar liggesusers\name{matern} \alias{whittle} \alias{matern} \alias{sobolev} \alias{whittle-matern} \title{Whittle-Matern Model} \description{ \command{matern} calculates the Whittle-Matern covariance function (Soboloev kernel). The Whittle model is given by \deqn{C(r)=W_{\nu}(r)=2^{1- \nu} \Gamma(\nu)^{-1}r^{\nu}K_{\nu}(r)}{C(r)=W_{\nu}(r)=2^{1- \nu} \Gamma(\nu)^{-1}r^{\nu}K_{\nu}(r)} where \eqn{\nu > 0}{\nu > 0} and \eqn{K_\nu}{K_\nu} is the modified Bessel function of second kind. The Matern model is given by \deqn{C(r) = \frac{2^{1-\nu}}{\Gamma(\nu)} (\sqrt{2\nu}r)^\nu K_\nu(\sqrt{2\nu}r)}{C(r) = 2^{1- \nu} \Gamma(\nu)^{-1} (\sqrt{2\nu} r)^\nu K_\nu(\sqrt{2\nu} r)} The Handcock-Wallis parametrisation equals \deqn{C(r) = \frac{2^{1-\nu}}{\Gamma(\nu)} (2\sqrt{\nu}r)^\nu K_\nu(2\sqrt{\nu}r)}{C(r) = 2^{1- \nu} \Gamma(\nu)^{-1} (2\sqrt{\nu} r)^\nu K_\nu(2\sqrt{\nu} r)} } \usage{ whittle(x, nu, derivative=0, scaling=c("whittle", "matern", "handcockwallis")) matern(x, nu, derivative=0, scaling=c("matern", "whittle", "handcockwallis")) } \arguments{ \item{x}{numerical vector; for negative values the modulus is used} \item{nu}{numerical vector with positive entries} \item{derivative}{value in \code{0:4}. } \item{scaling}{numerical vector of positive values or character; see Details.} } \value{ If \code{derivative=0}, the function value is returned, otherwise the \code{derivative}th derivative. A vector of \code{length(x)} is returned; \code{nu} is recycled; \code{scaling} is recycled if numerical. If \code{scaling} has a numerical values \eqn{s}, the covariance model equals \deqn{C(r) = \frac{2^{1-\nu}}{\Gamma(\nu)} (s\sqrt{\nu}r)^\nu K_\nu(s\sqrt{\nu}r)}{C(r) = 2^{1- \nu} \Gamma(\nu)^{-1} (s\sqrt{\nu} r)^\nu K_\nu(s\sqrt{\nu} r)} The function values are rather precise even for large values of \code{nu}. } \references{ Covariance function \itemize{ \item Chiles, J.-P. and Delfiner, P. (1999) \emph{Geostatistics. Modeling Spatial Uncertainty.} New York: Wiley. \item Gelfand, A. E., Diggle, P., Fuentes, M. and Guttorp, P. (eds.) (2010) \emph{Handbook of Spatial Statistics.} Boca Raton: Chapman & Hall/CRL. \item Guttorp, P. and Gneiting, T. (2006) Studies in the history of probability and statistics. XLIX. On the Matern correlation family. \emph{Biometrika} \bold{93}, 989--995. \item Handcock, M. S. and Wallis, J. R. (1994) An approach to statistical spatio-temporal modeling of meteorological fields. 
\emph{JASA} \bold{89}, 368--378. \item Stein, M. L. (1999) \emph{Interpolation of Spatial Data -- Some Theory for Kriging.} New York: Springer. } } \me \seealso{ \command{\link{nonstwm}} % For more details see also \command{\link[RandomFields]{RMmatern}}. } \keyword{spatial} \keyword{models} \keyword{math} \examples{% library(RandomFieldsUtils) x <- 3 confirm(matern(x, 0.5), exp(-x)) confirm(matern(x, Inf), gauss(x/sqrt(2))) confirm(matern(1:2, c(0.5, Inf)), exp(-(1:2))) } RandomFieldsUtils/man/internal.Rd0000644000176200001440000000425714227157055016540 0ustar liggesusers\name{Internal functions} \alias{checkExamples} \alias{Dependencies} \alias{debugging_level} \title{Internal functions} \description{ These functions are internal and should not be used. } \usage{ checkExamples(exclude = NULL, include=1:length(.fct.list), ask=FALSE, echo=TRUE, halt=FALSE, ignore.all = FALSE, path=package, package = "RandomFields", read.rd.files=TRUE, local = FALSE, libpath = NULL, single.runs = FALSE, reset, catcherror=TRUE) Dependencies(pkgs = all.pkgs, dir = "Dependencies", install = FALSE, check=TRUE, reverse=FALSE, package="RandomFields") debugging_level() } \arguments{ \item{exclude, include, ask, echo, halt, ignore.all, path, package, read.rd.files, local, libpath, single.runs, reset, catcherror }{internal; ignore.all refers to the \sQuote{all} export statement in the namespace -- whether this should be ignored. If \code{read.rf.files} is \code{TRUE} or a path to the Rd files, then the man pages are analysed to get all examples; \code{ignore.all} is then ignored. If \code{FALSE} only examples of functions (which are searched in the environments) are run. } \item{pkgs, dir,install, check, reverse}{internal } } \me \keyword{spatial} \examples{ ## internal function: no examples given \dontshow{\dontrun{ ## OK ## check own examples checkExamples(ignore.all=TRUE, halt=!TRUE, package="RandomFieldsUtils") ### Check the examples of the other packages: dep.packages <- c( #"DSpat","lgcp", "constrainedKriging", "MarkedPointProcess", "Geneland", "glmmBUGS", "ProbForecastGOP","geoR", "CompRandFld", ## RFsim does not work in version 2.1.18 "fractaldim", "rpanel", "spatstat") #for (i in dep.packages) library(i, character.only=TRUE) #for (i in dep.packages) install.packages(i) not.working <- list() for (.i in 1:length(dep.packages)) { not.working[[.i]] <- checkExamples(path=paste("~/TMP/dep.packages", dep.packages[.i], sep="/"), package=dep.packages[.i]) Print(.i, not.working); repeat{ if (readline()=="e") break} } Print(not.working) }} } RandomFieldsUtils/man/confirm.Rd0000644000176200001440000000120614227157055016350 0ustar liggesusers\name{confirm} \alias{confirm} \title{Test if Two Objects are (Nearly) Equal} \description{ \code{confirm(x, y)} is a utility to compare R objects \code{x} and \code{y} testing \sQuote{near equality} base on \command{\link[base]{all.equal}}. It is written too allow different behaviour on different operating systems } \usage{ confirm(x, y, ...) } \arguments{ \item{x,y,...}{see \command{\link[base]{all.equal}}} } \value{ Only \code{TRUE} or error in linux-gnu. Otherwise logical. 
} \me \examples{ x <- 3 confirm(gauss(x), exp(-x^2)) } \keyword{sysdata} \keyword{utilities} % LocalWords: pid unix Schlather url RandomFieldsUtils/man/uses.instruction.set.Rd0000644000176200001440000000166614227157055021056 0ustar liggesusers\name{Instruction Set} \alias{uses.simd.instruction} \alias{misses.simd.instruction} \title{ CPU instruction set } \description{ The function checks whether a certain instruction is used (missed) under the current compilation of a package. } \usage{ uses.simd.instruction(which=NULL, pkgs=NULL) misses.simd.instruction(which=NULL, pkgs=NULL) } \arguments{ \item{which}{character vector with values in \code{"SSE2", "SSSE3", "AVX", "AVX2", "CUDA"}} \item{pkgs}{character vector or missing.} } %\details{} \value{ logical vector of length \code{which} or matrix with number of rows equal to the length of \code{which}. An element is \code{TRUE} if the instruction set is used (missed) by the package. If an arguments is \code{NULL} all available information is given. } \me \examples{ % library(RandomFieldsUtils) uses.simd.instruction() misses.simd.instruction() } %library(gtools); keywords() \keyword{sysdata} RandomFieldsUtils/man/dbinorm.Rd0000644000176200001440000000117514227157055016352 0ustar liggesusers\name{dbinorm} \alias{dbinorm} \title{Density of a bivariate normal distribution} \description{ The function calculates the value of a bivariate normal distribution with mean 0. } \usage{ dbinorm (x, S) } \arguments{ \item{x}{ a matrix containing the \eqn{x} values and the \eqn{y} values in the first and second row respectively. Or it is a list of two vectors. } \item{S}{the covariance matrix; currently only diagonal matrix possible} } \value{ a vector according to the size of \code{x} } \me \examples{ x <- matrix(1:6, nc=2) + 0.0 C <- diag(c(1,2)) dbinorm(x, C) } \keyword{utilities} \keyword{misc} RandomFieldsUtils/man/macros/0000755000176200001440000000000014227157055015711 5ustar liggesusersRandomFieldsUtils/man/macros/allg_defn.Rd0000644000176200001440000000130514227157055020112 0ustar liggesusers% only change in RandomFieldsUtils!! 
\newcommand{\mysoftware}{\url{https://www.wim.uni-mannheim.de/schlather/publications/software}} \newcommand{\martin}{Martin Schlather, \email{schlather@math.uni-mannheim.de}, \url{https://www.wim.uni-mannheim.de/schlather/}} \newcommand{\marco}{Marco Oesting, \email{marco.oesting@mathematik.uni-stuttgart.de}, \url{https://www.isa.uni-stuttgart.de/institut/team/Oesting/}} \newcommand{\kirstin}{Kirstin Strokorb, \email{strokorbk@cardiff.ac.uk} \url{https://www.cardiff.ac.uk/people/view/542989-Dr-Kirstin-Strokorb}} \newcommand{\me}{\author{\martin}} \newcommand{\KMM}{\author{\kirstin \marco, \martin}} \newcommand{\RFU}{See \link[RandomFieldsUtils]{RFoptions}} RandomFieldsUtils/man/fileexists.Rd0000644000176200001440000000547014227157055017101 0ustar liggesusers\name{FileExists} \alias{FileExists} \alias{LockRemove} \alias{LockFile} \alias{WaitOthers} \title{Files} \description{ The function \code{FileExists} checks whether a file or a lock-file exists The function \code{LockRemove} removes a lock-file } \usage{ FileExists(file, printlevel=RFoptions()$basic$printlevel) LockFile(file, printlevel=RFoptions()$basic$printlevel) LockRemove(file) WaitOthers(file, i, cores, ideal.processes=ceiling(cores * 1.25), max.processes=ceiling(cores * 1.5), distance=5, time=5, path="./") } \arguments{ \item{file}{name of the data file} \item{printlevel}{if \code{PrintLevel<=1} no messages are displayed} \item{i}{integer; current value of process, usually the number of a loop index} \item{cores}{the number of cores on the machine} \item{ideal.processes,max.processes,distance}{integer. See Details} \item{time}{in minutes a process waits until it rechecks its environment} \item{path}{the current path of \code{file}} } \details{ \code{FileExists} checks whether file or file.lock exists. If none of them exists \code{file}.lock is created and hostname and PID are written into \code{file}.lock. This is useful if several processes use the same directory. Further, it is checked whether another process has tried to create the same file in the same instance. In this case \code{FileExists} returns for at least one of the processes that \code{file}.lock has already been created. \code{LockFile} is the same as \code{FileExists} except that it does not check whether \code{file} already exists. \command{WaitOthers} waits for others if more than \code{ideal.processes} processes have their value is less than \code{i} or if more than \code{cores} processes have their value is less than \code{i}-\code{distance}. It also waits if there are alreay \code{max.processes} are active. Note that \command{WaitOthers} write a file with ending \sQuote{.wait}, which is also deleted be \command{LockRemove}. } \value{ \code{FileExists} returns \item{1}{if \code{file} already exists} \item{2}{if \code{file}.lock already exists} \item{3}{if \code{file}.lock was tried to be created, but another process inferred and got priority} \item{0}{otherwise, \code{file} and \code{file}.lock did not exist and \code{file}.lock has been created} } \me \examples{ \dontrun{ ## the next command checks whether the file 'data.rda' ## or the file 'data.rda.lock' exists. If so, a positive ## value is returned. If not, the file 'data.rda.lock' ## is created and the value 0 returned. 
FileExists("data.rda") ## the next command deletes the file 'data.rda.lock' LockRemove("data.rda") } } \keyword{file} \keyword{utilities} % LocalWords: FileExists LockRemove PrintLevel RFoptions PID Schlather url RandomFieldsUtils/man/solvePosDef.Rd0000644000176200001440000000717614227157055017160 0ustar liggesusers\name{solve} \alias{solvePosDef} \alias{solvex} \alias{solve} \title{Solve a System of Equations for Positive Definite Matrices} \description{ This function solves the equality \eqn{a x = b} for \eqn{x} where \eqn{a} is a \bold{positive definite} matrix and \eqn{b} is a vector or a matrix. It is slightly faster than the inversion by the \code{\link[base]{chol}}esky decomposition and clearly faster than \code{\link[base]{solve}}. It also returns the logarithm of the determinant at no additional computational costs. } \usage{ solvex(a, b=NULL, logdeterminant=FALSE) %, sparse=NA, method=-1) } \arguments{ \item{a}{a square real-valued matrix containing the coefficients of the linear system. Logical matrices are coerced to numeric. } \item{b}{ a numeric or complex vector or matrix giving the right-hand side(s) of the linear system. If missing, \code{b} is taken to be an identity matrix and \code{solvex} will return the inverse of \code{a}. } \item{logdeterminant}{logical. whether the logarithm of the determinant should also be returned } } \value{ If \code{logdeterminant=FALSE} the function returns a vector or a matrix, depending on \code{b} which is the solution to the linear equation. Else the function returns a list containing both the solution to the linear equation and the logarithm of the determinant of \code{a}. } \details{ % The values of \code{method} could be: % \itemize{ % \item \code{<0} : If the matrix is diagonal direct calculations are performed. Else if the matrix is sparse the package \pkg{spam} is used. Else the Cholesky decomposition is tried. Note that with \code{RFoptions(pivot= )} pivoting can be enabled. Pivoting is about 30\% slower. If it fails, the eigen value decomposition is tried. } \references{ % See \link[spam]{chol.spam} of the package \pkg{spam} See \code{chol.spam} of the package \pkg{spam}. 
} %\seealso{ % \link[spam]{chol.spam} in the package \pkg{spam} %} \me \keyword{math} \examples{ % library(RandomFieldsUtils) RFoptions(solve_method = "cholesky", printlevel=1) set.seed(1) n <- 1000 x <- 1:n y <- runif(n) ## FIRST EXAMPLE: full rank matrix M <- exp(-as.matrix(dist(x) / n)) b0 <- matrix(nr=n, runif(n * 5)) b <- M \%*\% b0 + runif(n) ## standard with 'solve' print(system.time(z <- zR <- solve(M, b))) print(range(b - M \%*\% z)) stopifnot(all(abs((b - M \%*\% z)) < 2e-11)) ## using exactly the algorithm used in R RFoptions(pivot=PIVOT_NONE, la_mode=LA_R) ## (default) print(system.time(z <- solvex(M, b))) print(range(b - M \%*\% z)) stopifnot(all(z == zR)) ## Without pivoting, internal code: RFoptions(pivot=PIVOT_NONE, la_mode=LA_INTERN) ## (default) print(system.time(z <- solvex(M, b))) print(range(b - M \%*\% z)) stopifnot(all(abs((b - M \%*\% z)) < 2e-11)) ## Pivoting is slower here: RFoptions(pivot=PIVOT_DO, la_mode=LA_INTERN) print(system.time(z <- solvex(M, b))) print(range(b - M \%*\% z)) stopifnot(all(abs((b - M \%*\% z)) < 2e-11)) ## SECOND EXAMPLE: low rank matrix M <- x \%*\% t(x) + rev(x) \%*\% t(rev(x)) + y \%*\% t(y) b1 <- M \%*\% b0 ## Without pivoting, it does not work RFoptions(pivot=PIVOT_NONE, la_mode=LA_R) \dontrun{try(solve(M, b1))} RFoptions(pivot=PIVOT_NONE, la_mode=LA_INTERN) \dontrun{try(solvex(M, b1))} ## Pivoting works -- the precision however is reduced : RFoptions(pivot=PIVOT_DO, la_mode=LA_INTERN) print(system.time(z1 <- solvex(M, b1))) print(range(b1 - M \%*\% z1)) stopifnot(all(abs((b1 - M \%*\% z1)) < 2e-6)) ## Pivoting fails, when the equation system is not solvable: b2 <- M + runif(n) \dontrun{try(solvex(M, b2))} RFoptions(pivot = PIVOT_AUTO, la_mode = LA_AUTO) } RandomFieldsUtils/man/hostname.Rd0000644000176200001440000000121014227157055016524 0ustar liggesusers\name{host} \alias{hostname} \alias{pid} \title{System calls} \description{ The functions \code{hostname} and \code{pid} return the host name and the PID, respectively. } \usage{ hostname() pid() } \details{ If R runs on a unix platform the host name and the PID are returned, otherwise the empty string and naught, respectively. } \value{ \item{hostname}{returns a string} \item{pid}{returns an unsigned integer} } \me \examples{ cat("The name of your computer is '", hostname(), "'. 
Your R program has current pid ", pid(), ".\n", sep="") } \keyword{sysdata} \keyword{utilities} % LocalWords: pid unix Schlather url RandomFieldsUtils/man/Print.Rd0000644000176200001440000000107414227157055016012 0ustar liggesusers\name{Print} \alias{Print} \title{Print method returning also the names automatically} \description{ prints variable names and the values } \usage{ Print(..., digits = 6, empty.lines = 2) } \arguments{ \item{...}{any object that can be \command{print}-ed} \item{digits}{see \code{\link{print}}} \item{empty.lines}{number of leading empty lines} } \value{ prints the names and the values; for vectors \command{cat} is used and for lists \command{str} } \me \keyword{print} \examples{ a <- 4 b <- list(c=5, g=7) m <- matrix(1:4, nc=2) Print(a, b, m) }RandomFieldsUtils/man/orderx.Rd0000644000176200001440000000267514227157055016231 0ustar liggesusers\name{orderx} \alias{orderx} \title{Ordering Permutation} \description{ \command{orderx} has the same functionality as \command{\link[base]{order}}, except that \code{orderx(..., from=from, to=to)} is the same as \code{order[from:to]} } \usage{ orderx(x, from=1, to=length(x), decreasing=FALSE, na.last = NA) } \arguments{ \item{x}{an atomic vector} \item{from,to}{\code{order(..., from=from, to=to)} equals \code{order(...)[from:to]}} \item{decreasing}{ logical. Should the sort order be increasing or decreasing? } \item{na.last}{for controlling the treatment of \code{NA}s. If \code{TRUE}, missing values in the data are put last; if \code{FALSE}, they are put first; if \code{NA}, they are removed (see the Notes in \command{\link[base]{order}}) } } \value{ integer vector of length \code{to}-\code{from}+1. } \details{ The smaller the difference \code{to}-\code{from} is compared to the length of \code{x}, the faster is \command{orderx} compared to \link[base]{order}. Particularly, \code{orderx(..., from=k, to=k)} is much faster than \code{order(...)[k]}. \command{orderx} is never really slower than \command{order}. For further details see \link[base]{order}. } \seealso{ \link{sortx} } \examples{ x <- runif(10^6) k <- 10 system.time(y<-order(x)[1:k]) system.time(z<-orderx(x, from=1, to=k)) ## much faster stopifnot(all(x[y ]== x[z])) ## same result } \me \keyword{univar} \keyword{manip} RandomFieldsUtils/man/sleep.Rd0000644000176200001440000000062014227157055016022 0ustar liggesusers\name{sleep.milli} \alias{sleep.milli} \alias{sleep.micro} \alias{sleep} \title{Sleep} \description{ Process sleeps for a given amount of time } \usage{ sleep.milli(n) sleep.micro(n) } \arguments{ \item{n}{integer. sleeping time units} } \value{ No value is returned. } \me \examples{ ## next command waits half a second before returning sleep.milli(500) } \keyword{utilities} \keyword{misc} RandomFieldsUtils/man/Struve.Rd0000644000176200001440000000307714227157055016213 0ustar liggesusers\name{Struve} \alias{struve} \alias{Struve} \alias{struveH} \alias{struveL} \alias{bessel} \alias{I0L0} \alias{I0ML0} \title{Modified Struve functions and related functions} \description{ These functions return the values of the modified Struve functions and related functions } \usage{ struveH(x, nu) struveL(x, nu, expon.scaled=FALSE) I0L0(x) } \arguments{ \item{x}{non-negative numeric vector} \item{nu}{numeric vector} \item{expon.scaled}{logical; if \code{TRUE}, the results are exponentially scaled in order to avoid overflow or underflow respectively. } } \value{ Numeric vector with the (scaled, if \code{expon.scaled = TRUE}) values of the corresponding function. 
The length of the result is the maximum of the lengths of the arguments \code{x} and \code{nu}. The two arguments are recycled to that length. } \details{ \code{I0L0} returns %\code{\link[base]{besselI}(nu=0)} \code{besselI(nu=0)}. minus \code{struveL(nu=0)}. } \references{ \itemize{% \item MacLeod, A.J. (1993) Chebyshev expansions for modified Struve and related functions, \emph{Mathematics of Computation}, \bold{60}, 735-747 \item Abramowitz, M., and Stegun, I.A. (1984) \emph{Pocketbook of Mathematical Functions}, Verlag Harry Deutsch } } \seealso{ \link[base]{besselI} } \examples{ if (FALSE) { x <- seq(1, 2, 0.1) struveH(x, 0) struveH(x, 1) I0L0(x) - (besselI(x, nu=0) - struveL(x, 0)) besselI(x, nu=1) - struveL(x, 1) ## cf. Abramovitz & Stegun, table 12.1 } } \me \keyword{math} RandomFieldsUtils/man/sortx.Rd0000644000176200001440000000273414227157055016101 0ustar liggesusers\name{sortx} \alias{sortx} \title{Sorting Vectors} \description{ \command{sortx} has the same functionality as \command{\link[base]{sort}}, except that \code{sortx(..., from=from, to=to)} is the same as \code{sort[from:to]} Sort a vector or factor into ascending or descending order. } \usage{ sortx(x, from=1, to=length(x), decreasing=FALSE, na.last = NA) } \arguments{ \item{x}{an atomic vector} \item{from,to}{\code{sort(..., from=from, to=to)} equals \code{sort(...)[from:to]}} \item{decreasing}{ logical. Should the sort sort be increasing or decreasing? } \item{na.last}{for controlling the treatment of \code{NA}s. If \code{TRUE}, missing values in the data are put last; if \code{FALSE}, they are put first; if \code{NA}, they are removed (see the Notes in \command{\link[base]{sort}}) } } \value{ vector of length \code{to}-\code{from}+1. } \details{ The smaller the difference \code{to}-\code{from} is compared to the length of \code{x}, the faster is \command{sortx} compared to \link[base]{sort}. Particularly, \code{sortx(..., from=k, to=k)} is much faster than \code{sort(...)[k]}. For further details see \link[base]{sort}. } \seealso{ \link{orderx} } \examples{ x <- runif(10^6) k <- 10 system.time(y<-sort(x)[1:k]) system.time(z<-sortx(x, from=1, to=k)) ## much faster stopifnot(all(y == z)) ## same result } \author{Martin Schlather, \email{schlather@math.uni-mannheim.de} } \keyword{univar} \keyword{manip} RandomFieldsUtils/man/gauss.Rd0000644000176200001440000000213514227157055016037 0ustar liggesusers\name{gauss} \alias{gauss} \title{Gaussian Covariance Model} \description{ \command{gauss} is a stationary isotropic covariance model. The corresponding covariance function only depends on the distance \eqn{r \ge 0}{r \ge 0} between two points and is given by \deqn{C(r) = e^{-r^2}}{C(r)=e^{-r^2}.} } \usage{ gauss(x, derivative=0) } \arguments{ \item{x}{numerical vector; for negative values the modulus is used} \item{derivative}{value in \code{0:4}. } } \value{ If \code{derivative=0}, the function value is returned, otherwise the \code{derivative}th derivative. A vector of \code{length(x)} is returned; \code{nu} is recycled; \code{scaling} is recycled if numerical. } \references{ Gelfand, A. E., Diggle, P., Fuentes, M. and Guttorp, P. (eds.) (2010) \emph{Handbook of Spatial Statistics.} Boca Raton: Chapman & Hall/CRL. Stein, M. L. 
(1999) \emph{Interpolation of Spatial Data.} New York: Springer-Verlag } \me %\seealso{ For more details see \command{\link[RandomFields]{RMgauss}}.} \keyword{spatial} \keyword{models} \keyword{math} \examples{ x <- 3 confirm(gauss(x), exp(-x^2)) } RandomFieldsUtils/DESCRIPTION0000644000176200001440000000152014227516720015354 0ustar liggesusersPackage: RandomFieldsUtils Version: 1.2.5 Title: Utilities for the Simulation and Analysis of Random Fields and Genetic Data Author: Martin Schlather [aut, cre], Alexander FreudenBerg [aut], Reinhard Furrer [ctb], Martin Kroll [ctb], Brian D. Ripley [ctb], John W. Ratcliff et al. (cph) Maintainer: Martin Schlather Depends: R (>= 3.0) Imports: utils, methods, parallel Suggests: spam, RandomFields Description: Various utilities are provided that might be used in spatial statistics and elsewhere. It delivers a method for solving linear equations that checks the sparsity of the matrix before any algorithm is used. Copyright: MIT licence on sse2neon.H License: GPL (>= 3) URL: Packaged: 2022-04-18 03:42:38 UTC; schlather NeedsCompilation: yes Repository: CRAN Date/Publication: 2022-04-19 11:32:32 UTC RandomFieldsUtils/configure.ac0000644000176200001440000004007714227157055016150 0ustar liggesusersAC_INIT([RandomFieldsUtils], 1.0) ################################################## ## debugging options ################################################## #CROSS="arm32" #CROSS="avx2" #CROSS="nosimd" #CROSS="noflags" #CROSS="FALSE" #CROSS="TRUE" #CROSS="NA" #USE_GPU="try" #USE_GPU="yes" #CXX_FLAGS="-march=native" #CXX_FLAGS="-nonsense" #USERASKED="TRUE" #USERASKED="FALSE" #MEM_IS_ALIGNED="TRUE" #MEM_IS_ALIGNED="FALSE" ################################################## ## explicit options passed by RandomFieldsUtils ################################################## ## CXX_FLAGS (including omp) ## CROSS ## SIMD_FLAGS # superset of what is needed and recognized ## USE_GPU ## CUDA_HOME ## USERASKED ## MEM_IS_ALIGNED ################################################## ## package specific definitions ################################################## #MY_SSE2 #MY_SSE3 #MY_SSSE3 #MY_SSE41 #MY_AVX #MY_AVX2 #MY_AVX512F #MY_MAX_SSE2 #MY_MAX_SSE3 #MY_MAX_SSSE3 #MY_MAX_SSE41 #MY_MAX_AVX #MY_MAX_AVX2 #MY_MAX_AVX512F #MY_CU_FILES #MY_C_FILES ################################################## ## RandomFieldsUtils ################################################## ##CROSS= MY_SSE2="" MY_SSE3="" MY_SSSE3="" MY_SSE41="" MY_AVX="avx_fctns.o" MY_AVX2="avx2_fctns.o" MY_AVX512F="" MY_CU_FILES="solve_61.o gpu_info_61.o" MY_C_FILES="AutoRandomFieldsUtils.o beskf.o brdomain.o extern.o kleinkram.o maths.o options.o RFoptions.o solve.o sort.o sortLong.o utils.o win_linux_aux.o xport_import.o zzz.o gpu_info.o bckslvmodified.o cholmodified.o spamown.o obsolete.o" ################################################## ## general part ################################################## CXX=`"${R_HOME}/bin/R" CMD config CXX` AC_PROG_CXX AC_LANG(C++) SAVE_CXXFLAGS="$CXX_FLAGS" MY_CUDA_HOME="${CUDA_HOME}" if test "x${CROSS}" == x ; then CROSS=noflags elif test "x${CROSS}" == xTRUE || test "x${CROSS}" == xFALSE || test "x${CROSS}" == xNA; then CROSS_BOOL=yes fi AC_MSG_NOTICE([value of 'CROSS' is '${CROSS}'.]) if test "x${USERASKED}" == x ; then USERASKED_FLAG="" elif test "x${USERASKED}" == xTRUE ; then USERASKED_FLAG="-DUSERASKED=true" else USERASKED_FLAG="-DUSERASKED=false" fi #AC_MSG_NOTICE([value of 'MEM_IS_ALIGNED' is '${MEM_IS_ALIGNED}'.]) if test "x${MEM_IS_ALIGNED}" == x ; then 
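  # MEM_IS_ALIGNED has not been set by the caller: only the neutral
  # -DMEMisALIGNED=Nan flag is passed, without defining MEM_IS_ALIGNED.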
MEM_IS_ALIGNED_FLAG="-DMEMisALIGNED=Nan" elif test "x${MEM_IS_ALIGNED}" == xTRUE ; then MEM_IS_ALIGNED_FLAG="-DMEMisALIGNED=True -DMEM_IS_ALIGNED" else MEM_IS_ALIGNED_FLAG="-DMEMisALIGNED=False" fi ###################################################################### ## availability of run-time checks ###################################################################### #include ; # int main(){int B=1, s[[4]];__cpuid(s, B);} ]])] CPUID_FLAG="" AC_MSG_CHECKING([whether __cpuid is available]) AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[#]include [#]include int main(){int B=1, s[[4]]; __cpuid(s, B);} ])], [CPUID_FLAG="-DWINCPUID"], []) if test "x${CPUID_FLAG}" != x ; then AC_MSG_RESULT([via intrin.h]) else AC_MSG_RESULT([no]) AC_MSG_CHECKING([whether cpuid works under asm]) AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[#]include [#]include int main(){unsigned B; uint32_t s[[4]]; asm volatile ("cpuid": "=a"(s[[0]]), "=b"(s[[1]]),"=c"(s[[2]]), "=d"(s[[3]]):"a"(B),"c"(0)); } ])], [CPUID_FLAG="-DLINUXCPUID"], []) if test "x${CPUID_FLAG}" != x ; then AC_MSG_RESULT([yes]) else AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[#]include [#]include int main(){uint32_t a,b,c,d,level=0; __cpuid(level, a, b, c, d); } ])], [CPUID_FLAG="-DMINGWCPUID"], []) if test "x${CPUID_FLAG}" != x ; then AC_MSG_RESULT([via cpuid.h]) else AC_MSG_RESULT([no]) fi fi fi ###################################################################### ## cuda ###################################################################### # Details of CUDA compilation MY_CUDA_LIBS="" #MY_CUDA="" GPU_FLAG=" -DGPU_NEEDS=Inone" if test "x${MY_CU_FILES}" != x && test "x${CPUID_FLAG}" != "x-DWINCPUID" ; then GPU_FLAG=" -DGPU_NEEDS=Igpu" if test "x${USE_GPU}" != x ; then ## both cuda and graphics card available? AC_MSG_NOTICE([value of 'USE_GPU' is '${USE_GPU}'.]) AC_MSG_CHECKING([whether CUDA_HOME is set]) if test -z "${MY_CUDA_HOME}"; then AC_MSG_RESULT([no -- using /usr/local/cuda]) MY_CUDA_HOME="/usr/local/cuda" else AC_MSG_RESULT(using CUDA_HOME=${MY_CUDA_HOME}) fi AC_CHECK_FILE([${MY_CUDA_HOME}/bin/nvcc], CUDA_INSTALLED=yes, CUDA_INSTALLED=no) if test ${CUDA_INSTALLED} == yes ; then AC_MSG_NOTICE([${CUDA_INSTALLED}]) AC_CHECK_FILE([${MY_CUDA_HOME}/lib64/libcublas.so], , AC_MSG_ERROR([this package only works with 64 bit installations of CUDA])) GPU_FLAG="-DGPU_NEEDS=Inone -DUSEGPU" MY_CUDA_LIBS="-L${MY_CUDA_HOME}/lib64 -lcudart -lcublas -lcusolver -lcusolverMg" else if test "x${USE_GPU}" == xtry ; then MY_CU_FILES="" else AC_MSG_ERROR([No CUDA installation found, install CUDA or specify CUDA_HOME.]) fi fi else AC_MSG_NOTICE(['USE_GPU' has not been set.]) MY_CU_FILES="" fi fi ###################################################################### ### SIMD ###################################################################### #AC_MSG_NOTICE([simdflags ${SIMD_FLAGS}]) if test "x${SIMD_FLAGS}" == x ; then AC_MSG_CHECKING([SIMD options for some CC files]) ## if test "x${MY_MAX_AVX}" != x || test "x${MY_MAX_AVX2}" != x || test "x${MY_MAX_AVX512F}" != x ; then ANY_MAX_AVX=yes fi if test "x${MY_SSE2}" != x || test "x${MY_MAX_SSE2}" != x || test "x${MY_MAX_SSE3}" != x || test "x${MY_MAX_SSSE3}" != x || test "x${MY_MAX_SSE41}" != x || test "x${ANY_MAX_AVX}" == xyes ; then SIMD_FLAGS="sse2 $SIMD_FLAGS" fi if test "x${MY_SSE3}" != x || test "x${MY_MAX_SSE3}" != x || test "x${MY_MAX_SSSE3}" != x || test "x${MY_MAX_SSE41}" != x || test "x${ANY_MAX_AVX}" == xyes ; then SIMD_FLAGS="sse3 $SIMD_FLAGS" fi if test "x${MY_SSSE3}" != x || test "x${MY_MAX_SSSE3}" != x || test 
"x${MY_MAX_SSE41}" != x || test "x${ANY_MAX_AVX}" == xyes ; then SIMD_FLAGS="ssse3 $SIMD_FLAGS" fi if test "x${MY_SSE41}" != x || test "x${MY_MAX_SSE41}" != x || test "x${ANY_MAX_AVX}" == xyes ; then SIMD_FLAGS="sse41 $SIMD_FLAGS" fi if test "x${MY_AVX}" != x || test "x${ANY_MAX_AVX}" == xyes ; then SIMD_FLAGS="avx $SIMD_FLAGS" fi if test "x${MY_AVX2}" != x || test "x${MY_MAX_AVX2}" != x || test "x${MY_MAX_AVX512F}" != x; then SIMD_FLAGS="avx2 $SIMD_FLAGS" fi if test "x${MY_AVX512F}" != x || test "x${MY_MAX_AVX512F}" != x ; then SIMD_FLAGS="avx512f $SIMD_FLAGS" fi AC_MSG_RESULT([${SIMD_FLAGS}]) fi #AC_MSG_NOTICE([simdflags ${SIMD_FLAGS}]) m4_if([$2],[],[pushdef(prog, [int main(){}])], [pushdef(prog, [$2])]) ## which of SIMD_FLAGS are recognized? if test "x${CROSS_BOOL}" == xyes ; then ## sysctl -a | grep "cpu.features:" # for OS X AC_CHECK_PROG(LSCPU, [lscpu], yes, no) if test "x${LSCPU}" == xyes ; then INFO_CPU=`lscpu | grep Flags | tr "[[:upper:]]" "[[:lower:]]"` else AC_CHECK_PROG(SYSCTL, [sysctl], yes, no) if test "x${SYSCTL}" == xyes ; then INFO_CPU=`sysctl -a 2> /dev/null | grep machdep.cpu.features | tr "[[:upper:]]" "[[:lower:]]"` fi fi AC_MSG_CHECKING([which SIMD flags can be recognized easily]) TMP=${SIMD_FLAGS} SIMD_FLAGS="" for SET in ${TMP} ; do CXXFLAGS="$SAVE_CXXFLAGS -m${SET}" ## name is obligatory info=`echo "${INFO_CPU}" | grep " $SET "` if test "x${info}" != x ; then AC_COMPILE_IFELSE([AC_LANG_SOURCE([prog])], [SIMD_FLAGS="${SIMD_FLAGS} $SET"], []) fi done if test "x${SIMD_FLAGS}" == x ; then AC_MSG_RESULT([none]) else AC_MSG_RESULT([${SIMD_FLAGS}]) USE_AVX="yes" fi elif test "x${CROSS}" != xnosimd && test "x${CROSS}" != xnoflags ; then USE_AVX="yes" fi #AC_MSG_NOTICE([simdflags ${SIMD_FLAGS}]) # which counterpart "-mno-xxx" exists? 
NOT_EQUAL_OR_HIGHER_FLAG="" AC_MSG_CHECKING([which downwards controls might be used]) if test "x${CROSS}" == xnoflags ; then CXXFLAGS="" elif test "x${CROSS}" == xarm32 ; then CXXFLAGS="-mfpu=neon -funsafe-math-optimizations" AC_COMPILE_IFELSE([AC_LANG_SOURCE([prog])],[CROSS_FLAG="${CXXFLAGS}"],[CROSS_FLAG=""]) else CXXFLAGS="-mno-sse2" AC_COMPILE_IFELSE([AC_LANG_SOURCE([prog])],[NOT_SSE2="${CXXFLAGS}"],[CXXFLAGS=""]) fi if test "x${USE_AVX}" != xyes ; then AC_MSG_RESULT([${NOT_SSE2}.]) else if test "x${CROSS}" != xmmx ; then DO_SSE2="-msse2" CXXFLAGS="-mno-sse3" AC_COMPILE_IFELSE([AC_LANG_SOURCE([prog])], [NOT_SSE3="${CXXFLAGS}"],[CXXFLAGS=""]) if test "x${CROSS}" != xsse2 ; then DO_SSE3="-msse3" CXXFLAGS="-mno-ssse3" AC_COMPILE_IFELSE([AC_LANG_SOURCE([prog])], [NOT_SSSE3="${CXXFLAGS}"],[CXXFLAGS=""]) if test "x${CROSS}" != xsse3 ; then DO_SSSE3="-mssse3" CXXFLAGS="-mno-sse4.1" AC_COMPILE_IFELSE([AC_LANG_SOURCE([prog])], [NOT_SSE41="${CXXFLAGS}"],[CXXFLAGS=""]) if test "x${CROSS}" != xssse3 ; then DO_SSE41="-msse4.1" CXXFLAGS="-mno-avx" AC_COMPILE_IFELSE([AC_LANG_SOURCE([prog])], [NOT_AVX="${CXXFLAGS}"],[CXXFLAGS=""]) if test "x${CROSS}" != xsse41 ; then DO_AVX="-mavx" CXXFLAGS="-mno-avx2" AC_COMPILE_IFELSE([AC_LANG_SOURCE([prog])], [NOT_AVX2="${CXXFLAGS}"],[CXXFLAGS=""]) if test "x${CROSS}" != xavx ; then DO_AVX2="-mavx2" CXXFLAGS="-mno-avx512f" ## name is obligatory AC_COMPILE_IFELSE([AC_LANG_SOURCE([prog])], [NOT_AVX512F="${CXXFLAGS}"],[CXXFLAGS=""]) if test "x${CROSS}" != xavx2 ; then DO_AVX512F="-mavx512f" if test "x${CROSS_BOOL}" == xyes || test "x${CROSS}" == xavx512f ; then CXXFLAGS="" else AC_MSG_ERROR([unrecognized CROSS option '${CROSS}']) fi fi fi fi fi fi fi fi if test "x${CROSS_BOOL}" != xyes ; then ## and USE_AVX=yes NOT_EQUAL_OR_HIGHER_FLAG="${CXXFLAGS}" fi AC_MSG_RESULT([${NOT_AVX512F} ${NOT_AVX2} ${NOT_AVX} ${NOT_SSE41} ${NOT_SSSE3} ${NOT_SSE3} ${NOT_SSE2}.]) for SET in ${SIMD_FLAGS} ; do ## set MY_xxx_FLAGS sharp to xxx (and nothing higher, ## if supported by compiler) CXXFLAGS="$SAVE_CXXFLAGS -m$SET" ## name is obligatory flag_test=0 ## test only necesary if CROSS not in {TRUE, FALSE, nosimd}, ## but performed also cases except CROSS=nosimd AC_COMPILE_IFELSE([AC_LANG_SOURCE([prog])], [flag_test=1], [flag_test=0]) if test $flag_test == 0 ; then AC_MSG_ERROR([compilation failure for '$SAVE_CXXFLAGS -m$SET']) else if test "${SET}" == sse2 ; then SSE2_FLAGS="${DO_SSE2} ${NOT_SSE3}" elif test "${SET}" == sse3 ; then SSE3_FLAGS="${DO_SSE3} ${NOT_SSSE3}" elif test "${SET}" == ssse3 ; then SSSE3_FLAGS="${DO_SSSE3} ${NOT_SSE41}" elif test "${SET}" == sse41 ; then SSE41_FLAGS="${DO_SSE41} ${NOT_AVX}" elif test "${SET}" == avx ; then AVX_FLAGS="${DO_AVX} ${NOT_AVX2}" elif test "${SET}" == avx2 ; then AVX2_FLAGS="${DO_AVX2} ${NOT_AVX512F}" elif test "${SET}" == avx512f ; then AVX512F_FLAGS="${DO_AVX512F} " else EXOTIC_SIMD_FLAGS="-m${SET} ${EXOTIC_SIMD_FLAGS}" # iteratively called fi fi done fi popdef([prog]) ## add the list of MY_MAX_xxx files to list of MY_xxx files if test "x${MY_MAX_AVX512F}" != x ; then MAX_FILES="${MAX_FILES} ${MY_MAX_AVX512F}" fi if test "x${DO_AVX512F}" != x && test "x${MAX_FILES}" != x ; then MY_AVX512F="${MAX_FILES} ${MY_AVX512F}" MAX_FILES="" fi if test "x${MY_MAX_AVX2}" != x ; then MAX_FILES="${MY_MAX_AVX2} ${MAX_FILES}" fi if test "x${DO_AVX2}" != x && test "x${MAX_FILES}" != x ; then MY_AVX2="${MAX_FILES} ${MY_AVX2}" MAX_FILES="" fi if test "x${MY_MAX_AVX}" != x ; then MAX_FILES="${MAX_FILES} ${MY_MAX_AVX}" fi if test "x${DO_AVX}" != x && test 
"x${MAX_FILES}" != x ; then MY_AVX="${MAX_FILES} ${MY_AVX}" MAX_FILES="" fi if test "x${MY_MAX_SSE41}" != x ; then MAX_FILES="${MAX_FILES} ${MY_MAX_SSE41}" fi if test "x${DO_SSE41}" != x && test "x${MAX_FILES}" != x ; then MY_SSE41="${MAX_FILES} ${MY_SSE41}" MAX_FILES="" fi if test "x${MY_MAX_SSSE3}" != x ; then MAX_FILES="${MAX_FILES} ${MY_MAX_SSSE3}" fi if test "x${DO_SSSE3}" != x && test "x${MAX_FILES}" != x ; then MY_SSSE3="${MAX_FILES} ${MY_SSSE3}" MAX_FILES="" fi if test "x${MY_MAX_SSE3}" != x ; then MAX_FILES="${MAX_FILES} ${MY_MAX_SSE3}" fi if test "x${DO_SSE3}" != x && test "x${MAX_FILES}" != x ; then MY_SSE3="${MAX_FILES} ${MY_SSE3}" MAX_FILES="" fi if test "x${MY_MAX_SSE2}" != x ; then MAX_FILES="${MAX_FILES} ${MY_MAX_SSE2}" fi if test "x${DO_SSE2}" != x && test "x${MAX_FILES}" != x ; then MY_SSE2="${MAX_FILES} ${MY_SSE2}" MAX_FILES="" fi #AC_MSG_NOTICE([XXXuseavx=${USEAVX} less than ${NOT_EQUAL_OR_HIGHER_FLAG}; server=${CROSS}]) ## determine the SIMD upper bound in case of CROSS=TRUE if test "x${USE_AVX}" != xyes ; then if test "x$CROSS" != xFALSE && test "x$CROSS" != xnoflags ; then NOT_EQUAL_OR_HIGHER_FLAG="${NOT_SSE2}" fi elif test "x$CROSS" == xTRUE ; then if test "x${MY_AVX512F}" != x ; then NOT_EQUAL_OR_HIGHER_FLAG="" elif test "x${MY_AVX2}" != x ; then NOT_EQUAL_OR_HIGHER_FLAG="${NOT_AVX512F}" elif test "x${MY_AVX}" != x ; then NOT_EQUAL_OR_HIGHER_FLAG="${NOT_AVX2}" elif test "x${MY_SSE41}" != x ; then NOT_EQUAL_OR_HIGHER_FLAG="${NOT_AVX}" elif test "x${MY_SSSE3}" != x ; then NOT_EQUAL_OR_HIGHER_FLAG="${NOT_SSE41}" elif test "x${MY_SSE3}" != x ; then NOT_EQUAL_OR_HIGHER_FLAG="${NOT_SSSE3}" elif test "x${MY_SSE2}" != x ; then NOT_EQUAL_OR_HIGHER_FLAG="${NOT_SSE3}" else NOT_EQUAL_OR_HIGHER_FLAG="${NOT_SSE2}" fi fi #AC_MSG_NOTICE([useavx=${USEAVX} less than ${NOT_EQUAL_OR_HIGHER_FLAG}; server=${CROSS}]) ## prepare CROSS flags for Makevars.in if test "x$CROSS" == xTRUE ; then CROSS_FLAG="-DREQUIRED_SIMD -DCROSS_CAPACITY=${NOT_EQUAL_OR_HIGHER_FLAG}" elif test "x$CROSS" == xFALSE ; then CROSS_FLAG="-DREQUIRED_SIMD=3" elif test "x$CROSS" == xNA ; then CROSS_FLAG="-DREQUIRED_SIMD=2" elif test "x$CROSS" == xnoflags ; then CROSS_FLAG="" elif test "x$CROSS" == xarm32 ; then CROSS_FLAG="${CROSS_FLAG} -DREQUIRED_SIMD=4" ## higher 3 reservered for ARM elif test "x$CROSS" != xnosimd ; then CROSS_FLAG="-DCROSS_CAPACITY=${CROSS}" elif test "x${NOT_SSE2}" == x ; then # && CROSS = nosimd CROSS_FLAG="-DREQUIRED_SIMD=0" else ## CROSS = nosimd, no explicit limitation possible CROSS_FLAG="-DREQUIRED_SIMD=1" fi OMP="\$(SHLIB_OPENMP_CXXFLAGS)" MY_PKG_FLAGS="${SAVE_CXXFLAGS} ${CPUID_FLAG} ${GPU_FLAG} ${EXOTIC_SIMD_FLAGS} ${NOT_EQUAL_OR_HIGHER_FLAG} ${CROSS_FLAG} ${MEM_IS_ALIGNED_FLAG} ${USERASKED_FLAG}" MY_C_FILES="$MY_C_FILES $MY_SSE2 $MY_SSE3 $MY_SSSE3 $MY_SSE41 $MY_AVX $MY_AVX2 $MY_AVX512F $MAX_FILES" MY_LIB_FLAGS="$LIB_FLAGS ${OMP}" TMP="PKG_CXXFLAGS = ${MY_PKG_FLAGS} ${OMP} " # AC_MSG_NOTICE([default compilation option is ${MY_PKG_FLAGS}]) ## prepare MY_xxx for Makevars.in if test "x${USE_AVX}" == xyes ; then if test "x${MY_SSE2}" != x ; then MY_SSE2="${MY_SSE2}: ${TMP} ${SSE2_FLAGS}" fi if test "x${MY_SSE3}" != x ; then MY_SSE3="${MY_SSE3}: ${TMP} ${SSE3_FLAGS}" fi if test "x${MY_SSSE3}" != x ; then MY_SSSE3="${MY_SSSE3}: ${TMP} ${SSSE3_FLAGS}" fi if test "x${MY_SSE41}" != x ; then MY_SSE41="${MY_SSE41}: ${TMP} ${SE41_FLAGS}" fi if test "x${MY_AVX}" != x ; then MY_AVX="${MY_AVX}: ${TMP} ${AVX_FLAGS}" fi if test "x${MY_AVX2}" != x ; then MY_AVX2="${MY_AVX2}: ${TMP} ${AVX2_FLAGS}" fi if test 
"x${MY_AVX512F}" != x ; then MY_AVX512F="${MY_AVX512F}: ${TMP} ${AVX512F_FLAGS}" fi else MY_SSE2="" MY_SSE3="" MY_SSSE3="" MY_SSE41="" MY_AVX="" MY_AVX2="" MY_AVX512F="" fi AC_SUBST(MY_PKG_FLAGS) AC_SUBST(MY_LIB_FLAGS) AC_SUBST(MY_CUDA_LIBS) AC_SUBST(MY_CUDA_HOME) AC_SUBST(MY_SSE2) AC_SUBST(MY_SSE3) AC_SUBST(MY_SSSE3) AC_SUBST(MY_SSE41) AC_SUBST(MY_AVX) AC_SUBST(MY_AVX2) AC_SUBST(MY_AVX512F) #AC_SUBST(MY_CUDA) AC_SUBST(MY_CU_FILES) AC_SUBST(MY_C_FILES) AC_CONFIG_FILES([src/Makevars]) AC_OUTPUT RandomFieldsUtils/src/0000755000176200001440000000000014227157055014441 5ustar liggesusersRandomFieldsUtils/src/General_utils.h0000644000176200001440000000174214227157055017413 0ustar liggesusers /* Authors Martin Schlather, schlather@math.uni-mannheim.de Copyright (C) 2015 -- 2021 Martin Schlather This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #ifndef rfutils_general_H #define rfutils_general_H 1 #if defined OBSOLETE_RFU #include "RFU.h" #include "kleinkram.h" #else #include "errors_messages.h" #include "options.h" #endif #endif RandomFieldsUtils/src/RFU.h0000644000176200001440000000253014227157055015246 0ustar liggesusers#ifndef RFU_rfutils_h #define RFU_rfutils_h 1 #include "errors_messages.h" //#define SCALAR_RU_H 1 #define SCALAR_BASE 0 #define SCALAR_AVX 1 #define SCALAR_NEARFMA 6 // never change number, see haplogeno.R !! #define SCALAR_KAHAN 8 #define SOLVE 0 #define MATRIXSQRT 1 #define DETERMINANT 2 #define SOLVE_METHODS 3 typedef // benoetigt struct solve_storage { errorstring_type err_msg; InversionMethod method, newMethods[SOLVE_METHODS]; usr_bool sparse; int size, actual_size, actual_pivot; int nsuper; Long n_main, n_rhs, n_w2, n_U, n_D, n_w3, n_lnz, n_result; // SICH, n_MM, n_VT, n_ work, n_ nnzlindx, int *pivot_idx, n_pivot_idx, *iwork, n_iwork, //eigen, svd, LU, spam *pivotsparse, n_pivotsparse, *xlnz, n_xlnz, //spam *snode, n_snode, *xsuper, n_xsuper, *invp, n_invp, // spam *cols, n_cols, *rows, n_rows, *lindx, n_lindx, // spam *xja, n_xja; // chol, eigen, spam double *main, *rhs,// diagonal, general -- FORBIDDEN for further use *w2, // eigen, svd, LU, QR, pivot *U, // eigen, svd, pivot *D, // eigen, svd, cholesky, spam, pivot *w3, // spam, QR, svd, eigen *lnz, // spam, svd *result, // sqrtPosDefFree *to_be_deleted; } solve_storage; #define LINEAR_BASE 0 #define LINEAR_AVX 1 void linearX(double *x, double y, Long len, double *out, Long n); #endif RandomFieldsUtils/src/RFoptions.cc0000644000176200001440000010047714227157055016704 0ustar liggesusers /* Authors Martin Schlather, schlather@math.uni-mannheim.de Copyright (C) 2016 -- 2021 Martin Schlather This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include "intrinsics.h" #define RFU_NEED_OBSOLETE 1 #include "Basic_utils.h" #include "RandomFieldsUtils.h" #include "zzz_RandomFieldsUtils.h" #include "xport_import.h" #include "kleinkram.h" #include "extern.h" extern const char *basic[basicN]; extern setparameterfct setparam[MAXNLIST]; extern getparameterfct getparam[MAXNLIST]; extern finalsetparameterfct finalparam[MAXNLIST]; extern deleteparameterfct delparam[MAXNLIST]; bool obsolete_package_in_use = false; // OBSOLETE AVAILABLE_SIMD //typedef struct getlist_type { int ListNr, i; }; // getlist_type; void setoptions(int i, int j, SEXP el, char name[LEN_OPTIONNAME], bool isList, bool local); void getoptions(SEXP sublist, int i, bool local); #define MAXNLIST 7 #define PKGNAMELENGTH 20 int NList = 0; int noption_class_list = 0, AllprefixN[MAXNLIST] = {0, 0, 0, 0, 0, 0, 0}, Allversion[MAXNLIST] = {0, 0, 0, 0, 0, 0, 0}, *AllallN[MAXNLIST] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL}; const char *option_class_list[MAXNLIST] = {prefixlist[1], NULL, NULL, NULL, NULL, NULL}, **Allprefix[MAXNLIST] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL}, ***Allall[MAXNLIST] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL}; name_type pkgnames = {"", "", "", "", "", "", ""}; // MAXNLIST !!! setoptions_fctn setoption_fct_list[MAXNLIST][MAXNLIST] = {{setoptions, NULL, NULL, NULL, NULL, NULL, NULL}, {NULL, NULL, NULL, NULL, NULL, NULL, NULL}, {NULL, NULL, NULL, NULL, NULL, NULL, NULL}, {NULL, NULL, NULL, NULL, NULL, NULL, NULL}, {NULL, NULL, NULL, NULL, NULL, NULL, NULL}, {NULL, NULL, NULL, NULL, NULL, NULL, NULL}, {NULL, NULL, NULL, NULL, NULL, NULL, NULL}}; getoptions_fctn getoption_fct_list[MAXNLIST][MAXNLIST] = {{getoptions, NULL, NULL, NULL, NULL, NULL, NULL}, {NULL, NULL, NULL, NULL, NULL, NULL, NULL}, {NULL, NULL, NULL, NULL, NULL, NULL, NULL}, {NULL, NULL, NULL, NULL, NULL, NULL, NULL}, {NULL, NULL, NULL, NULL, NULL, NULL, NULL}, {NULL, NULL, NULL, NULL, NULL, NULL, NULL}, {NULL, NULL, NULL, NULL, NULL, NULL, NULL}}; finalsetoptions_fctn finaloption_fct_list[MAXNLIST] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL }; deleteoptions_fctn deloption_fct_list[MAXNLIST] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL}; bool installed [MAXNLIST] = { false, false, false, false, false, false, false}; install_modes min_simd_needs[MAXNLIST] = // currently only used if == none or not {Inone, Inone, Inone, Inone, Inone, Inone, Inone}, min_gpu_needs[MAXNLIST] = // currently not used {Inone, Inone, Inone, Inone, Inone, Inone, Inone}; Uint simd_infos [MAXNLIST] = {0, 0, 0, 0, 0, 0, 0}; /* precide information of min_install is not used yet; just whether it is 'none' or not */ void hintVariable(char *name, int warn_unknown_option) { static bool printing = true; // da nur Mutterprozess schreiben darf if (warn_unknown_option > 0 && OPTIONS.basic.Rprintlevel > 0) { PRINTF("'%s' is considered as a variable (not as an option).\n", name); if (printing && OPTIONS.basic.helpinfo && !parallel()) { PRINTF("[This hint can be turned off by 'RFoptions(%s=-%d)'.]\n", basic[INSTALL_RUN_WARN_OPTION], MAX(1, warn_unknown_option)); printing = false; } } } void 
setparameter(SEXP el, char *prefix, char *mainname, bool isList, getlist_type *getlist, int warn_unknown_option, bool local, int calling) { int j = NOMATCHING, i = NOMATCHING, ListNr = NOMATCHING; char name[LEN_OPTIONNAME]; SPRINTF(name, "%.50s%.50s%.50s", prefix, STRLEN(prefix) == 0 ? "" : ".", mainname); if (STRCMP(prefix, "")) { for (ListNr=0; ListNr= 0 && STRCMP(prefix, Allprefix[k][ii]) == 0) { ListNr = k; i = ii; break; } // ii >0 } // for k if (i == MULTIPLEMATCHING) ERR1("option prefix name '%.50s' is ambiguous.", prefix); } // prefix == List j = Match(mainname, Allall[ListNr][i], AllallN[ListNr][i]); } else { // (i==0), no prefix given #define MinNameLength 3 for (ListNr=0; ListNr= 'A' && name[0] <= 'Z') { hintVariable(name, warn_unknown_option); return; } ERR1("Unknown option: '%.50s'.", name); case WARN_UNKNOWN_OPTION_ALL : default : ERR1("unknown option '%.50s'.", name); } return; } // printf("j=%d %.50s\n", j, j >=0 ? Allall[ListNr][i][j] : "multi"); if (j < 0 || STRCMP(mainname, Allall[ListNr][i][j])) { int starti = i + 1; for (int k = ListNr; k= 0 && STRCMP(mainname, Allall[k][ii][jj]) == 0) { ListNr = k; i = ii; j = jj; break; } // jj } // for ii if (j >= 0) break; } // for k } // if j < 0 || != } // no prefix given if (j<0) { if (j == NOMATCHING) ERR1("option not recognized: '%.50s'.", name) else ERR1("Multiple partial matching for option '%.50s'.", name); } if (getlist != NULL) { int k=0; while((getlist[k].ListNr != ListNr || getlist[k].i != i) && getlist[k].ListNr >= 0) k++; if (getlist[k].ListNr < 0) ERR2("Option '%.50s' not allowed for this call.\n In case you really need this option, use the command 'RFoption(%.50s=...)'", mainname, mainname); } setoptions_fctn *set = setoption_fct_list[ListNr]; if (set[ListNr] == NULL) setparam[ListNr](i, j, el, name, isList, local); else if (local && set[calling]!= NULL) set[calling](i, j, el, name, isList, local); else set[ListNr](i, j, el, name, isList, local); } void getListNr(bool save, int t, int actual_nbasic, SEXP which, getlist_type *getlist, int *Nr, int *idx // output ){ int i, ListNr; const char *z; if (save && t < noption_class_list) z = option_class_list[t]; else z = (char*) CHAR(STRING_ELT(which, t - actual_nbasic)); for (ListNr=0; ListNr= NList) ERR0("unknown value for 'getoptions_'"); if (getlist != NULL) { getlist[t].ListNr = ListNr; getlist[t].i = i; } *Nr = ListNr; *idx = i; } SEXP getRFUoptions(int ListNr, int i, bool local, int calling) { SEXP sublist, subnames; int elmts = AllallN[ListNr][i]; PROTECT(sublist = allocVector(VECSXP, elmts)); PROTECT(subnames = allocVector(STRSXP, elmts)); for (int k=0; k= NList) BUG; //printf("hier\n"); int i, j, lenlist, lensub; SEXP el, list, sublist, names, subnames, ans = R_NilValue; char *name, *pref; bool isList = false; int warn_unknown_option = OPTIONS.installNrun.warn_unknown_option, n_protect = 0; bool local = false; /* In case of strange values of a parameter, undelete the comment for PRINTF */ options = CDR(options); /* skip 'name' */ // printf("obsolete %d\n", obsolete_package_in_use); // printf(" %d %d\n", OPTIONS.installNrun.warn_unknown_option, options == R_NilValue); name = (char*) ""; if (options != R_NilValue) { if (!isNull(TAG(options))) name = (char*) CHAR(PRINTNAME(TAG(options))); if (STRCMP(name, "local_") == 0 || (STRCMP(name, "LOCAL") == 0 && obsolete_package_in_use) ) { el = CAR(options); local = (bool) INT; options = CDR(options); /* skip 'name' */ } } // printf("hierA\n"); if (options == R_NilValue || STRCMP(name, "") == 0) return getRFUoptions(local, 
calling); if (!isNull(TAG(options))) name = (char*) CHAR(PRINTNAME(TAG(options))); if (STRCMP(name, "warnUnknown_") == 0) { el = CAR(options); warn_unknown_option = INT; options = CDR(options); /* skip 'name' */ } // printf("hierB\n"); if (!isNull(TAG(options))) name = (char*) CHAR(PRINTNAME(TAG(options))); if ((isList = STRCMP(name, "list_") == 0 || (STRCMP(name, "LIST") == 0 && obsolete_package_in_use) )) { // printf("hierC\n"); if (local) ERR0("'list_' can be used only globally."); list = CAR(options); if (TYPEOF(list) != VECSXP) ERR1("'list_' needs as argument the output of '%.50s'", RFOPTIONS); PROTECT(names = getAttrib(list, R_NamesSymbol)); lenlist = length(list); if (lenlist > 0 && !local && parallel()) ERR0("Global 'RFoptions' such as 'cores', 'seed' and 'printlevel', may be set only outside any parallel code. See '?RandomFieldsUtils::RFoptions' for the complete list of global 'RFoptions'"); for (i=0; i 0) UNPROTECT(n_protect); if (!local) OPTIONS.basic.asList = true; // OK return(ans); } void attachSetNGet(char * callingName, char *pkgname, setoptions_fctn set, getoptions_fctn get) { int calling = 0; for ( ; calling= NList) {BUG;} for (int ListNr=0; ListNr= NList) {ERR1("package '%.50s' does not exist\n", pkgname);} return simd_infos[NList]; } */ SEXP instruction_set(SEXP which, SEXP pkgs, SEXP Uses) { #if defined ARM32 #define N_LISTE 3 const char* liste[N_LISTE] = {"CUDA", "SSE2/NEON", "SSSE3/NEON"}; Uint bit[2][N_LISTE] = {{gpuMISS, sse2MISS, ssse3MISS}, {gpuUSE, sse2USE, ssse3USE}}; #elif defined X86_64 #define N_LISTE 6 const char* liste[N_LISTE] = {"CUDA", "SSE2", "SSSE3", "AVX", "AVX2", "AVX512F"}; Uint bit[2][N_LISTE] = {{gpuMISS, sse2MISS, ssse3MISS, avxMISS, avx2MISS, avx512fMISS}, {gpuUSE, sse2USE, ssse3USE, avxUSE, avx2USE, avx512fUSE}}; #else #define N_LISTE 1 const char* liste[N_LISTE] = {"CUDA"}; Uint bit[2][N_LISTE] = {{gpuMISS}, {gpuUSE}}; #endif int rowIdx[N_LISTE], colIdx[MAXNLIST], cols = length(pkgs), rows = length(which); Uint *simd_bit = bit[LOGICAL(Uses)[0]]; if (rows == 0) rows = N_LISTE; if (cols == 0) cols = NList; if (cols > MAXNLIST) ERR0("duplicated package names or request on packages not supported by RandomFieldsUtils"); if (rows > N_LISTE) ERR0("duplicated SIMD names or request on SIMD versions not supported by "); SEXP Ans, rownames, colnames; PROTECT(rownames = allocVector(STRSXP, rows)); if (length(which) == 0) for (int i=0; i= 0) { if (colIdx[p] >= NList) BUG; Uint simd_info = simd_infos[colIdx[p]]; for (int i=0; i 0; } } else for (int i=0; i RFU_VERSION) { ERR1("An obsolete version of RandomFieldsUtils has been installed in meanwhile that is incompatible the compiled version of '%.50s'.", pkgname); } else { ERR2("Package '%.50s' has been compiled against an obsolete version of RandomFieldsUtils. Please recompile '%.50s'.", pkgname, pkgname); } } installNrun_options *ip = &(OPTIONS.installNrun); if ((usr_bool) mem_is_aligned != ip->mem_is_aligned) { #if defined SCHLATHERS_MACHINE PRINTF("mem alignment: %s=%d RFU=%d\n", pkgname, mem_is_aligned, ip->mem_is_aligned); #else if (mem_is_aligned != Nan || ip->mem_is_aligned!=True) WARN2("'%.50s' is compiled with an alignment assumption different from the package 'RandomFieldsUtils'. See MEM_IS_ALIGNED and mem_is_aligned in ?RFoptions.\n Recompile with 'RandomFieldsUtils::RFoptions(install.control=list(MEM_IS_ALIGNED=%.10s))'.", pkgname, ((usr_bool) mem_is_aligned == Nan && ip->mem_is_aligned==True) || (usr_bool) mem_is_aligned==True ? 
"TRUE" : "FALSE" // OK ); #endif } for (int ListNr=0; ListNr 0) { PRINTF("options starting with prefix '%s' have been already attached (%s %1.1f).", PKGprefixlist[0], pkgname, version / 10.0); } return; } } if (basicopt) option_class_list[noption_class_list++] = PKGprefixlist[0]; if (NList >= MAXNLIST) BUG; strcopyN(pkgnames[NList], pkgname, PKGNAMELENGTH); Allprefix[NList] = PKGprefixlist; AllprefixN[NList] = N; Allall[NList] = PKGall; AllallN[NList] = PKGallN; Allversion[NList] = version; setoption_fct_list[NList][NList] = set; getoption_fct_list[NList][NList] = get; finaloption_fct_list[NList] = final; deloption_fct_list[NList] = del; // printf("simd_needs %d \n", simd_needs); if ((simd_info & 1<install != Inone || !ip->installPackages); if (ip->install != Inone) ip->installPackages |= min_simd_needs[NList] > Inone; // if only gpu_needs, the system has been recompiled already, // and probably no graphic card has been found // printf("%s %d %d\n", pkgnames[NList], // ip->installPackages, min_simd_needs[NList]); NList++; PLoffset = pl_offset; basic_options *gp = &(OPTIONS.basic); PL = gp->Cprintlevel = gp->Rprintlevel + PLoffset; CORES = gp->cores; if (setRFU != NULL) { attachSetNGet(pkgname, (char *) "RandomFieldsUtils", setRFU, getRFU); } } void detachRFUoptions(const char **PKGprefixlist, int N) { int ListNr; for (ListNr=0; ListNr= NList) { ERR1("options starting with prefix '%.50s' have been already detached.", PKGprefixlist[0]); } if (deloption_fct_list[ListNr] == NULL) { if (delparam[ListNr] != NULL) delparam[ListNr](isGLOBAL); } else if (deloption_fct_list[ListNr] != NULL) deloption_fct_list[ListNr](false); int i; for (i=0; iinstallPackages = false; int zaehler =0; bool force = ip->mem_is_aligned != MEMisALIGNED ? true : LOGICAL(Force)[0]; //printf("get %d\n", NList); for (int ListNr=0; ListNrcores = 1; } else { if (min_simd_needs[0] == Inone) { CORES = gp->cores = *n; } } #else CORES = gp->cores = 1; #endif } //## 5 8, 0 8 void recompilationNeeded(int *n) { *n = (int) false; if (INSTALL_DEFAULT == Inone) return; for (int ListNr=0; ListNr required) required = min_simd_needs[ListNr]; // printf("AV ok x\n"); #if defined MSDOS_WINDOWS if (required > Inone) { PRINTF("\n\nBy default the packages are compiled without flag '-mavx' under your OS.\nIf you are sure that AVX2 is available, consider adding the flag '-march=native'\nto 'PKG_CXXFLAGS' in the file src/Makefile.win and then recompile\n'"); if (required >= Iavx2) PRINTF("Or: try adding flag '-mavx' or '-mavx2' to 'PKG_CXXFLAGS'\n"); } if (!HAS_PARALLEL) PRINTF("For OMP alone, try adding the flags -Xpreprocessor -fopenmp -pthread to PKG_LIBS and PKG_CXXFLAGS"); #elif defined ARM32 if (required > Inone) { PRINTF("\n\n install.packages(, configure.args=\"CROSS='FALSE'%s\")\n install.packages(, configure.args=\"CROSS='FALSE' USE_GPU='yes'%s\")", OMP, OMP); // OK } #else if (required > Inone) { PRINTF("\n\n install.packages(, configure.args=\"CXX_FLAGS='-march=native%s'\")\n\n install.packages(, configure.args=\"CXX_FLAGS='-march=native%s' USE_GPU='yes'\")",OMP, OMP); if (required > Iavx) { PRINTF("\n\n install.packages(, configure.args=\"CXX_FLAGS='-mavx%s'\")", OMP); if (required > Iavx2) PRINTF("\\n install.packages(, configure.args=\"CXX_FLAGS='-mavx2%s'\")", OMP); } } /* HINT && MISS_ANY_SIMD ? 
"\n\nAlternatively,\n install.packages(\""#PKG"\", configure.args=\"USE_AVX='yes'\") \nOr, consider installing '"#PKG"'\nfrom https://github.com/schlather/"#PKG", i.e.,\n install.packages(\"devtools\")\n library(devtools)\n devtools::install_github(\"schlather/"#PKG"/pkg\")" : "", */ #endif #if ! defined __APPLE__ if (!HAS_PARALLEL) PRINTF("\n\nFor OMP alone try\n install.packages(, configure.args=\"CXX_FLAGS='%s'\")", OMP); #endif } else { if (!STRCMP("OMP", CHAR(STRING_ELT(pkgs, 0)))) return ScalarString(mkChar(OMP)); bool all = !STRCMP("all", CHAR(STRING_ELT(pkgs, 0))); int len = all ? NList : length(pkgs); for (int i=0; i 0 && avx2Avail) PRINTF("AVX512F, "); if ((simd_info & 1< 0 && avx2Avail) PRINTF("AVX2, "); if ((simd_info & 1< 0 && avxAvail) PRINTF("AVX,"); if ((simd_info & 1< 0 && ssse3Avail) PRINTF("SSSE3, "); if ((simd_info & 1< 0 && sse2Avail) PRINTF("SSE2, "); if (!HAS_PARALLEL) PRINTF("OMP."); } PRINTF("\n"); } } } } PRINTF("\n\nOr call 'RFoptions(install=\"no\")' after loading to avoid being asked again.\n"); #else // neither ARM nor X86_64 const char OMP[80] = ""; #endif return ScalarString(mkChar(OMP)); } RandomFieldsUtils/src/zzz.c0000644000176200001440000001240314227157055015442 0ustar liggesusers/* Authors Martin Schlather, schlather@math.uni-mannheim.de Copyright (C) 2015 -- 2021 Martin Schlather This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ #define RFU_NEED_OBSOLETE 1 #define NO_SSE2 1 #include "Basic_utils.h" #include "win_linux_aux.h" #include "RandomFieldsUtils.h" #include "Utils.h" #include "zzz_RandomFieldsUtils.h" #define none 0 #if defined(__clang__) //# pragma clang diagnostic ignored "-Wcast-function-type" #endif #ifdef __GNUC__ // https://gcc.gnu.org/onlinedocs/gcc/Diagnostic-Pragmas.html //// GCC diagnostic ignored "-Wcast-function-type" #endif static R_NativePrimitiveArgType int_arg[] = { INTSXP }, host_arg[] = { STRSXP, INTSXP}; // static R_NativeArgStyle argin[] = {R_ARG_IN}, // argout[] = {R_ARG_OUT}, // hostarg[] = {R_ARG_OUT, R_ARG_OUT}; #define CDEF(name, n, type) {#name, (DL_FUNC) & name, n, type} static const R_CMethodDef cMethods[] = { CDEF(sleepMilli, 1, int_arg), CDEF(sleepMicro, 1, int_arg), CDEF(pid, 1, int_arg), CDEF(hostname, 2, host_arg), CDEF(setCPUs, 1, int_arg), CDEF(recompilationNeeded, 1, int_arg), CDEF(loadoptions, 0, none), CDEF(detachoptions, 0, none), {NULL, NULL, 0, NULL} }; #define CALLDEF(name, n) {#name, (DL_FUNC) &name, n} static R_CallMethodDef callMethods[] = { // in die respectiven C-Dateien muss RandomFieldsUtils.h eingebunden sein CALLDEF(SIMDmessages, 1), CALLDEF(DebugCall, 0), CALLDEF(Chol, 1), CALLDEF(debuggingLevel, 0), CALLDEF(scalarR, 3), CALLDEF(SolvePosDefR, 3), CALLDEF(struve, 4), CALLDEF(besselk_simd, 2), CALLDEF(I0ML0, 1), CALLDEF(gaussr, 2), CALLDEF(WMr, 4), CALLDEF(logWMr, 4), CALLDEF(sortX, 4), CALLDEF(orderX, 4), CALLDEF(DivByRow, 2), CALLDEF(colMaxs, 1), CALLDEF(quadratic, 2), CALLDEF(dotXV, 2), CALLDEF(rowMeansX, 2), CALLDEF(rowProd, 1), CALLDEF(dbinorm, 2), CALLDEF(chol2mv, 2), CALLDEF(tcholRHS, 2), CALLDEF(crossprodX, 3), CALLDEF(getPackagesToBeInstalled, 1), CALLDEF(isGPUavailable,0), CALLDEF(isNEONavailable,0), CALLDEF(isX86_64,0), CALLDEF(gpu_info,1), CALLDEF(instruction_set, 3), // CALLDEF(), {NULL, NULL, 0} }; #define EXTDEF(name, n) {#name, (DL_FUNC) &name, n} static const R_ExternalMethodDef extMethods[] = { // in die respectiven C-Dateien muss RandomFieldsUtils.h eingebunden sein EXTDEF(RFoptions, -1), {NULL, NULL, 0} }; #define CALLABLE(FCTN) R_RegisterCCallable("RandomFieldsUtils", #FCTN, (DL_FUNC) FCTN) void R_init_RandomFieldsUtils(DllInfo *dll) { CALLABLE(del_utilsoption); CALLABLE(get_utilsoption); CALLABLE(push_utilsoption); CALLABLE(params_utilsoption); CALLABLE(solve_DELETE); CALLABLE(solve_NULL); CALLABLE(SolvePosDef); CALLABLE(SolvePosDefSp); CALLABLE(SqrtPosDefFree); CALLABLE(xCinvXdet); CALLABLE(xCinvYdet); CALLABLE(DetPosDefsp); CALLABLE(InvertMatrix); CALLABLE(cholesky); CALLABLE(DetPosDef); CALLABLE(Is_positive_definite); CALLABLE(sqrtRHS); CALLABLE(chol2inv); CALLABLE(StruveH); CALLABLE(StruveL); CALLABLE(I0mL0); CALLABLE(WM); CALLABLE(DWM); CALLABLE(DDWM); CALLABLE(D3WM); CALLABLE(D4WM); CALLABLE(logWM); CALLABLE(Gauss); CALLABLE(DGauss); CALLABLE(DDGauss); CALLABLE(D3Gauss); CALLABLE(D4Gauss); CALLABLE(logGauss); CALLABLE(attachRFUoptions); CALLABLE(detachRFUoptions); // CALLABLE(linkRFUoptions); CALLABLE(RFUoptions); CALLABLE(attachSetNGet); CALLABLE(getoptionsRFU); CALLABLE(setoptionsRFU); // OBSOLETE_RFU CALLABLE(getUtilsParam); CALLABLE(attachRFoptions); CALLABLE(detachRFoptions); CALLABLE(relaxUnknownRFoption); // obsolete CALLABLE(getErrorString); // obsolete CALLABLE(setErrorLoc); // obsolete CALLABLE(ToIntI); CALLABLE(solvePosDef); CALLABLE(solvePosDefSp); CALLABLE(sqrtPosDefFree); CALLABLE(XCinvXdet); CALLABLE(XCinvYdet); CALLABLE(detPosDefsp); CALLABLE(detPosDef); CALLABLE(invertMatrix); CALLABLE(chol); 
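  /* Note: the CALLABLE() entries in this function use R_RegisterCCallable
     (see the macro above) to export these routines at the C level, so that
     dependent packages such as RandomFields can retrieve them at run time
     via R_GetCCallable("RandomFieldsUtils", "<name>") instead of linking
     against this shared object directly. */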
CALLABLE(is_positive_definite); CALLABLE(ordering); CALLABLE(orderingL); CALLABLE(orderingInt); CALLABLE(orderingLong); CALLABLE(sorting); CALLABLE(sortingL); CALLABLE(sortingInt); CALLABLE(sortingLong); CALLABLE(scalarX); // CALLABLE(scalarInt); CALLABLE(pid); CALLABLE(parallel); CALLABLE(sleepMicro); // problem? R_registerRoutines(dll, cMethods, callMethods, NULL, // .Fortran extMethods); R_useDynamicSymbols(dll, FALSE); // } #ifdef SCHLATHERS_MACHINE #ifdef __GNUC__ // https://gcc.gnu.org/onlinedocs/gcc/Diagnostic-Pragmas.html //// GCC diagnostic push //// GCC diagnostic ignored "-Wcast-function-type" #endif #endif void R_unload_RandomFieldsUtils(DllInfo *info) { } #ifdef __GNUC__ //// GCC diagnostic pop #endif RandomFieldsUtils/src/gpu_info_61.cu0000644000176200001440000000637114227157055017115 0ustar liggesusers/* Authors Alexander Freudenberg, afreuden@mail.uni-mannheim.de Copyright (C) 2022 -- 2022 Alexander Freudenberg This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ /* This file contains helper functions to show information on the available GPU devices. It can be dynamically loaded in an R script with dyn.load Compilation: // nvcc -Xcompiler -fpic -I /usr/share/R/include --shared gpu_info.cu -o gpu_info.so */ #include #include #include #include #include "Basic_utils.h" #include "errors_messages.h" #define NUMBER_OF_ATTRIBUTES 5 SEXP gpu_info_61(SEXP DEVICES){ int N_GPUS = length(DEVICES), protects = 0, count = 0, i = 0, device_number = -1, *devices = INTEGER(DEVICES); size_t free_mem = 0, total_mem = 0; double scaling_constant = 1024.0 * 1024.0 * 1024.0; cudaDeviceProp deviceProp; SEXP gpu_info_list; SEXP list_element; SEXP dimnames = PROTECT(allocVector(VECSXP, NUMBER_OF_ATTRIBUTES)); protects++; // define dimnames of gpu_info_list SET_VECTOR_ELT(dimnames, 0, mkChar("Device Number")); SET_VECTOR_ELT(dimnames, 1, mkChar("Device Name")); SET_VECTOR_ELT(dimnames, 2, mkChar("Compute Capability")); SET_VECTOR_ELT(dimnames, 3, mkChar("Free Memory")); SET_VECTOR_ELT(dimnames, 4, mkChar("Total Memory")); // check if any device is out of bound cudaGetDeviceCount(&count); if(count == 0) ERR0("No CUDA devices found.\n"); for(i=0; i= count) ERR1("Device out of bound: %d\n", device_number); } // allocate info list PROTECT(gpu_info_list = allocVector(VECSXP, N_GPUS)); protects++; // Fill list with device info for(i = 0; i ans[j]) ans[j] = w; } } } PutRNGstate(); UNPROTECT(1); return Ans; } // #endif // SCHLATHERS_MACHINE RandomFieldsUtils/src/AutoRandomFieldsUtils.cc0000644000176200001440000000320214227157055021166 0ustar liggesusers/* Authors Martin Schlather, schlather@math.uni-mannheim.de main library for unconditional simulation of random fields Copyright (C) 2015 -- 2021 Martin Schlather This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of 
the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/

#include "AutoRandomFieldsUtils.h"

const char *R_TYPE_NAMES[LAST_R_TYPE_NAME + 1] = { // never change ! see kleinkram.cc
  "NILSXP" /* 0 */, "SYMSXP", "LISTSXP", "CLOSXP", "ENVSXP",
  "PROMSXP", "LANGSXP", "SPECIALSXP", "BUILTINSXP", "CHARSXP",
  "LGLSXP" /* 10 */, "??", "??", "INTSXP", "REALSXP",
  "CPLXSXP", "STRSXP", "DOTSXP", "ANYSXP", "VECSXP",
  "EXPRSXP" /*20 */, "BCODESXP", "EXTPTRSXP", "WEAKREFSXP", "RAWSXP",
  "S4SXP" /* 25 */, "", "", "", "",
  "NEWSXP" /* 30 */, "FREESXP", "??SXP"},
  *LA_NAMES[LA_LAST + 1] = { "intern", "R", "auto", "GPU", "query"},
  *PIVOT_NAMES[PIVOT_LAST + 1] = {"no pivoting", "do", "auto", "idx", "undefined"},
  *INSTALL_NAMES[INSTALL_LAST + 1] = {"no installation", "install", "ask", "sse",
    "sse2", "sse3", "ssse3", "avx", "avx2", "avx512f", "gpu"};
RandomFieldsUtils/src/def.h0000644000176200001440000000207614227157055015355 0ustar liggesusers
/*
 Authors
 Martin Schlather, schlather@math.uni-mannheim.de

 Copyright (C) 2015 -- 2021 Martin Schlather

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 3
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/

/// sysconf (_SC_NPROCESSORS_ONLN) // number of cores available
// int get_nprocs (void) // ditto

#ifndef RFUdef_H
#define RFUdef_H 1

//// 1
//// 1
//// 1

#if ! defined SCHLATHERS_MACHINE && defined SCHLATHER_DEBUGGING
#undef SCHLATHER_DEBUGGING
#else
//// 1
#endif
// // 1

#endif
RandomFieldsUtils/src/utils.cc0000644000176200001440000005115014227157055016112 0ustar liggesusers
/*
 Authors
 Martin Schlather, schlather@math.uni-mannheim.de

 Collection of system specific auxiliary functions

 Copyright (C) 2001 -- 2021 Martin Schlather

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 3
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/ #include "Basic_utils_local.h" #include #include #include "RandomFieldsUtils.h" #include "kleinkram.h" #include "zzz_RandomFieldsUtils.h" #include "Utils.h" #include "xport_import.h" #include "extern.h" AVAILABLE_SIMD double *ToRealI(SEXP X, bool *create) { KEY_type *KT = KEYT(); if (TYPEOF(X) == REALSXP) { *create = false; return REAL(X); } // TO DO !! //HELPINFO("Better use 'double' as storage mode (for one of the arguments)."); int len = length(X); double *y; if (create || KT->ToRealN < len) { y = (double *) MALLOC(sizeof(double) * len); if (y == NULL) ERR1("not enough memory for an %d vector of doubles", len); if (!create) { FREE(KT->ToRealDummy); KT->ToRealDummy = y; KT->ToRealN = len; } } else y = KT->ToRealDummy; int *x; if (TYPEOF(X)==INTSXP) x=INTEGER(X); else x=LOGICAL(X); for (int i=0; i 100 || PL > 1) // HELPINFO("Better use 'integer' as storage mode (for one of the arguments)."); int *y; if (*create || KT->ToIntN < len) { y = (int *) MALLOC(sizeof(int) * len); if (y == NULL) ERR1("not enough memory for an %d vector of integers", len); if (!*create) { FREE(KT->ToIntDummy); KT->ToIntDummy = y; KT->ToIntN = len; } } else y = KT->ToIntDummy; double *x = (double *) REAL(X); if (round) for (int i=0; i=(uintptr_t)X); return (double *) algn_general(X); } #if defined SSE41 || defined AVX2 int static inline *algnInt(int *X) { assert(algn_general(X)>=(uintptr_t)X); return (int *) algn_general(X); } #endif void colMaxsIint(int *M, Long r, Long c, int *ans) { if (r < 32 #if defined AVX2 || !avx2Avail #elif defined SSE41 || !sse41Avail #endif ) { for (Long i=0; iglobal_utils.basic.cores; Long nrow = nrows(AA), ncol = ncols(AA), dim = length(X), k = MIN(ncol / 2, nrow), m = MAX(ncol, nrow); double eps = 1e-14, *A = REAL(AA), *C = REAL(CC), *x = REAL(X), z[2], *a[2] = {(double*) MALLOC(m * m * sizeof(double)), (double*) MALLOC(m * m * sizeof(double))}; if (ncols(CC) != nrows(CC) || ncols(CC) != ncol) BUG; if (length(X) != nrow) BUG; for (int i=0; i<=17; i++) { for (int j=0; j<=1; j++) { extern bool obsolete_package_in_use; SetLaMode(j == 0 || obsolete_package_in_use ? LA_INTERN : LA_R, cores); switch(i) { case 1: z[j] = XkCXtl(A, C, nrow, ncol, nrow / 3, nrow / 4, cores); break; case 2: XCXt(A, C, a[j], nrow, ncol, cores); break; case 3: AtA(A, nrow, ncol, a[j], cores); break; case 4: xA(x, A, nrow, ncol, a[j], cores); break; case 5: xA_noomp(x, A, nrow, ncol, a[j]); break; // case : xA(x1, x2, A, nrow, ncol, a[j]1, a[j]2); break; case 6: z[j] = xAx(x, C, nrow, cores); break; case 7: Ax(A, C, nrow, ncol, a[j], cores); break;// C genuegend lang. Reicht. // case 8: Ax(A, x, x2, nrow, ncol, a[j]1, a[j]2); break; case 8: z[j] =xUy(x, C, A, dim, cores); break; // A genuegend lang. Reicht. case 9: z[j] =xUxz(x, C, dim, a[j], cores); break; case 10: z[j] =x_UxPz(x, C, A, dim,cores); break; // A genuegend lang. Reicht. 
case 11: z[j] =xUx(x, C, dim, cores); break; case 12: matmult(A, C, a[j], nrow, ncol, k, cores); break; case 13: matmulttransposed(A, C, a[j], ncol, nrow, k, cores); break; //case : matmulttransposedInt(int *A, int *B, int *c, ncol, ncol, k); break; case 14: matmult_2ndtransp(A, C, a[j], nrow, ncol, k, cores); break; case 15: matmult_2ndtransp(A, C, a[j], nrow, ncol, cores); break; case 16: matmult_tt(A, C, a[j], ncol, nrow, k,cores); break; // case 17: z[j]= scalar(A, C, ncol); break; default: BUG; } int size = 0; switch(i) { case 1: case 6: case 8: case 9:case 10: case 11: case 17: if (FABS(z[0] - z[1])> eps) { PRINTF("i=%d", i); BUG; } break; case 2: size = ncol * ncol; break; case 3: case 15: size = nrow * nrow; break; case 4 : case 5: size = ncol; break; case 7 : size = nrow; break; case 12: case 13: case 14: case 16: size = nrow * k; break; default: BUG; } for (int p=0; p eps) { PRINTF("i=%d, %d", i, p); BUG; } } } FREE(a[0]); FREE(a[1]); return R_NilValue; } SEXP quadratic(SEXP A, SEXP x) { KEY_type *KT = KEYT(); int cores = KT->global_utils.basic.cores; SEXP ans; int len = length(x); if (len != nrows(A) || len != ncols(A)) ERR0("'x' and 'A' do not match."); PROTECT(ans = allocVector(REALSXP, 1)); REAL(ans)[0] = xAx(REAL(x), REAL(A), len, cores); UNPROTECT(1); return ans; } SEXP dotXV(SEXP M, SEXP V) { Long r = nrows(M), c = ncols(M), l = length(V) ; if (l != r) ERR0("X and v do not match"); if (r == 0) return R_NilValue; SEXP Ans; PROTECT(Ans = allocMatrix(REALSXP, r, c)); // bringt nix //#ifdef DO_PARALLEL //#p ragma omp parallel for num_threads(CORES) //#endif for (Long i=0; in_data_names == 0) xor (KT->data_names != NULL)); // assert((KT->n_coord_names == 0) xor (KT->coord_names != NULL)); // assert((KT->n_data_idx == 0) xor (KT->data_idx != NULL)); // assert((KT->n_coord_idx == 0) xor (KT->coord_idx != NULL)); return R_NilValue; } #define Nmodi 9 name_type modi = { "1x1", "2x2", "4x4", "8x8", "near", "simple", "precise", "kahan", "1x1p"}; double scalarprod( double * v1, double * v2, Long N){ double *endv1 = v1 + N, sum = 0; for(; v1!= endv1; v1++, v2++) sum += v2[0] * v1[0]; return sum; } double scalarprod2by2( double * v1, double * v2, Long N){ double *endv1 = v1 + (N / 2) * 2, *end = v1 + N, sum = 0; for(; v1 < endv1; v1 += 2, v2 += 2) sum += v2[0] * v1[0] + v2[1] * v1[1]; if (v1 < end) sum += v2[0] * v1[0]; return sum; } double scalarprod4by4( double * v1, double * v2, Long N){ // printf("4by4 %d %d %d\n", sse, sse2, avx); double*endv1 = v1 + (N / 4) * 4, *end = v1 + N, sum = 0; for(; v1 < endv1; v1 += 4, v2 += 4) sum += v2[0] * v1[0] + v2[1] * v1[1] + v2[2] * v1[2]+ v2[3] * v1[3]; for(; v1 < end; v1++, v2++) sum += v2[0] * v1[0]; return sum; } double scalarprod8by8( double * v1, double * v2, Long N){ double *endv1 = v1 + (N / 8) * 8, *end = v1 + N, sum = 0.0; for(; v1 < endv1; v1 += 8, v2 += 8) sum += v2[0] * v1[0] + v2[1] * v1[1]+ v2[2] * v1[2] + v2[3] * v1[3] + v2[4] * v1[4] + v2[5] * v1[5]+ v2[6] * v1[6]+ v2[7] * v1[7]; for(; v1 < end; v1++, v2++) sum += v2[0] * v1[0]; return sum; } void avx_scalarprodM(double * x, double * y, Long len, double *res); double avx_scalarprodDnearfma(double * x, double * y, Long len); double avx_scalarprodD(double * x, double * y, Long L); double avx_scalarprodDopt(double * x, double * y, Long L); double avx_scalarprodDP(double * x, double * y, Long L) ; double avx_scalarprodDK(double * x, double * y, Long L); double scalarX(double *x, double *y, Long len, Long n) { // parallel lohnt i.A. 
nicht: 28.2.20121 alles was parallel ist, rausgeworfen assert(n >= 0); // __m128 a, b; a = _mm_add_ps ((__m128) a, (__m128) b); // __m128i c, d; c = _mm_add_epi16 ((__m128i) c, (__m128i) d); // __m256d e, f; e = _mm256_add_pd ( e, f); // printf("n=%d %d ",n, avx); switch(n) { // printf("%d\n", n); case SCALAR_AVX : // printf(" # %d ", avx); if (avxAvail) return avx_scalarprodD(x, y, len); // best one kernel break; case 2 : return scalarprod(x, y, len); case 3 : return scalarprod2by2(x, y, len); case 4 : return scalarprod8by8(x, y, len); // case 5 : //#ifdef FMA_AVAILABLE // return avx_scalarprodDfma(x, y, len); //#endif case SCALAR_NEARFMA : if (avxAvail) return avx_scalarprodDnearfma(x, y, len); break; case 7 : if (avxAvail) return avx_scalarprodDP(x, y, len); //best break; case SCALAR_KAHAN : if (avxAvail) return avx_scalarprodDK(x, y, len); // kahan break; /* case 10: if (avx) return avx_scalarprodDopt(x, y, len); // best one kernel break; case 11: double result[6]; if (avx) { avx_scalarprodM(x, y, len, result); // best one kernel return result[0]; } break; */ case SCALAR_BASE : default : {} } return scalarprod4by4(x, y, len); } SEXP scalarR(SEXP x, SEXP y, SEXP Mode) { // unused Long len = length(x); if (length(y) != len) ERR0("x and y differ in length"); int mode; if (length(Mode) == 0) mode = -1; else if (INTSXP==TYPEOF(Mode)) mode = INTEGER(Mode)[0]; else mode = Match((char*) CHAR(STRING_ELT(Mode, 0)), modi, Nmodi); SEXP Ans; if (isMatrix(x)) { Long nc = ncols(x); PROTECT(Ans = allocVector(REALSXP, nc * (nc - 1) / 2)); double *ans = REAL(Ans); *ans = scalarX(REAL(x), REAL(y), len, 11); // no PROTECT( needed UNPROTECT(1); } else { PROTECT(Ans = allocVector(REALSXP, 1)); double *ans = REAL(Ans); *ans = scalarX(REAL(x), REAL(y), len, mode); // no PROTECT( needed UNPROTECT(1); } return Ans; } SEXP crossprodX(SEXP X, SEXP Y, SEXP mode) { KEY_type *KT = KEYT(); int cores = KT->global_utils.basic.cores; Long n, nrow, len, lenY, ncol; if (isMatrix(X)) { nrow = ncols(X); len = nrows(X); } else { nrow = 1; len = length(X); } if (isMatrix(Y)) { ncol = ncols(Y); lenY = nrows(Y); } else { ncol = 1; lenY = length(Y); } if (lenY != len) ERR0("sizes of 'x' and 'y' do not match"); if (length(mode) == 0) n = SCALAR_DEFAULT; else { n = INTEGER(mode)[0]; if (n < 0) n = SCALAR_DEFAULT; } SEXP Ans; PROTECT(Ans = allocMatrix(REALSXP, nrow, ncol)); double *ans = REAL(Ans), *x = REAL(X), *y = REAL(Y); if (x == y) AtA(x, len, ncol, ans, cores); else matmulttransposed(x, y, ans, len, nrow, ncol, cores); UNPROTECT(1); return Ans; } void avx_linearprodD( double * v1, double v2, Long N, double *inout); void linearprod2by2( double * v1, double v2, Long N, double *inout){ double *endv1 = v1 + (N / 2) * 2, *end = v1 + N; for(; v1 < endv1; v1+=2, inout+=2) { inout[0] += v2 * v1[0]; inout[1] += v2 * v1[1]; } if (v1 < end) inout[0] += v2 * v1[0]; } void linearX(double *x, double y, Long len, double *inout, Long n) { switch(n) { case LINEAR_AVX : if (avxAvail) { avx_linearprodD(x, y, len, inout); return; } break; case LINEAR_BASE: default : {} } linearprod2by2(x, y, len, inout); } RandomFieldsUtils/src/sortLong.cc0000644000176200001440000002646714227157055016576 0ustar liggesusers/* Authors Martin Schlather, schlather@math.uni-mannheim.de Copyright (C) 2017 -- 2021 Martin Schlather This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include "Basic_utils_local.h" // must be before anything else #include "RandomFieldsUtils.h" #include "zzz_RandomFieldsUtils.h" typedef bool (*vergleich)(Long, Long, void *O); bool smaller1L(Long i, Long j, void *orderd) { return ((double *) orderd)[i] < ((double *) orderd)[j]; } bool greater1L(Long i, Long j, void *orderd) { return ((double *) orderd)[i] > ((double *) orderd)[j]; } bool smallerLong1(Long i, Long j, void *orderedLong) { return ((Long *) orderedLong)[i] < ((Long *)orderedLong)[j]; } bool greaterLong1(Long i, Long j, void *orderedLong) { return ((Long *)orderedLong)[i] > ((Long *)orderedLong)[j]; } typedef bool (*vergleichX)(Long, Long, int, void *O); // vergleichX SMALLERXLong=NULL, GREATERXLong=NULL; bool smallerL(Long i, Long j, int orderDim, void *O) { double *x, *y, *orderd = (double*) O; x = orderd + i * orderDim; y = orderd + j * orderDim; for(Long d=0; d y[d]; return false; } bool smallerLong(Long i, Long j, int orderDim, void *O) { Long *x, *y, *orderedLong = (Long*) O; x = orderedLong + i * orderDim; y = orderedLong + j * orderDim; for(Long d=0; d y[d]; return false; } void orderLong(Long *pos, Long start, Long end, vergleich SMALLER, vergleich GREATER, void * orderd, Long order_from, Long order_to) { Long randpos, pivot, left, right, pivotpos; if( start < end ) { randpos = (start + end) / 2; pivot = pos[randpos]; pos[randpos] = pos[start]; pos[start] = pivot; pivotpos=start; left = start; right=end+1; while (left < right) { while (++left < right && SMALLER(pos[left], pivot, orderd)) pivotpos++; while (--right > left && GREATER(pos[right], pivot, orderd)); if (left < right) { Long swap=pos[left]; pos[left]=pos[right]; pos[right]=swap; pivotpos++; } } pos[start] = pos[pivotpos]; pos[pivotpos] = pivot; if (start <= order_to && pivotpos > order_from) orderLong(pos, start, pivotpos-1, SMALLER, GREATER, orderd, order_from, order_to); if (pivotpos < order_to && end >= order_from) orderLong(pos, pivotpos + 1, end, SMALLER, GREATER, orderd, order_from, order_to); } } void XorderLong(Long *pos, Long start, Long end, vergleichX SMALLER, vergleichX GREATER, Long D, void * orderd, Long order_from, Long order_to ) { Long randpos, pivot, left, right, pivotpos; if( start < end ) { randpos = (start + end) / 2; pivot = pos[randpos]; pos[randpos] = pos[start]; pos[start] = pivot; pivotpos=start; left = start; right=end+1; while (left < right) { while (++left < right && SMALLER(pos[left], pivot, D, orderd)) pivotpos++; while (--right > left && GREATER(pos[right], pivot, D, orderd)); if (left < right) { Long swap=pos[left]; pos[left]=pos[right]; pos[right]=swap; pivotpos++; } } pos[start] = pos[pivotpos]; pos[pivotpos] = pivot; if (start <= order_to && pivotpos > order_from) XorderLong(pos, start, pivotpos-1, SMALLER, GREATER, D, orderd, order_from, order_to); if (pivotpos < order_to && end >= order_from) XorderLong(pos, pivotpos + 1, end, SMALLER, GREATER, D, orderd, order_from, order_to); } } void orderingFromToL(double *d, Long len, int dim, Long *pos, Long from, Long to, usr_bool NAlast) { Long start, end; if (NAlast == Nan) { for (Long i=0; i left && 
orderd[right] > pivot); if (left < right) { double swap = orderd[left]; orderd[left]=orderd[right]; orderd[right]=swap; pivotpos++; } } orderd[start] = orderd[pivotpos]; orderd[pivotpos] = pivot; if (start <= order_to && pivotpos > order_from) quicksortL(start, pivotpos-1, orderd, order_from, order_to); if (pivotpos < order_to && end >= order_from) quicksortL(pivotpos + 1, end, orderd, order_from, order_to); } } void sortingFromToL(double *d, Long len, Long from, Long to, usr_bool NAlast) { Long start, end; if (NAlast == Nan) { end = len-1; start = 0; } if (NAlast == True) { start = end = 0; Long NAend = len - 1; while (end < NAend) { while (NAend >= 0 && (ISNA(d[NAend]) || ISNAN(d[NAend]))) NAend--; while (end < NAend && !ISNA(d[end]) && !ISNAN(d[end])) end++; if (end < NAend) { double swap = d[end]; d[end] = d[NAend]; d[NAend--] = swap; } } assert(NAend == end && false); } else { // if (NAlast == False) { start = end = len - 1; Long NAstart = 0; while (start > NAstart) { while(NAstart < len && (ISNA(d[NAstart]) || ISNAN(d[NAstart]))) NAstart++; while (start > NAstart && !ISNA(d[start]) && !ISNAN(d[start])) start--; if (start > NAstart) { double swap = d[start]; d[start] = d[NAstart]; d[NAstart++] = swap; } } assert(NAstart == start); } quicksortL(start, end, d, from - 1, to - 1); } void sortingL(double *d, Long len, usr_bool NAlast) { sortingFromToL(d, len, 1, len, NAlast); } void sortLong(Long start, Long end, Long *orderedLong, Long order_from, Long order_to) { Long left, right, pivotpos; if( start < end ) { Long randpos = (start + end) / 2; Long pivot = orderedLong[randpos]; orderedLong[randpos] = orderedLong[start]; orderedLong[start] = pivot; pivotpos=start; left = start; right = end+1; while (left < right) { while (++left < right && orderedLong[left] < pivot) pivotpos++; while (--right > left && orderedLong[right] > pivot); if (left < right) { Long swap = orderedLong[left]; orderedLong[left]=orderedLong[right]; orderedLong[right]=swap; pivotpos++; } } orderedLong[start] = orderedLong[pivotpos]; orderedLong[pivotpos] = pivot; if (start <= order_to && pivotpos > order_from) sortLong(start, pivotpos-1, orderedLong, order_from, order_to); if (pivotpos < order_to && end >= order_from) sortLong(pivotpos + 1, end, orderedLong, order_from, order_to); } } void sortingLongFromTo(Long *d, Long len, Long from, Long to, usr_bool NAlast){ /* quicksort algorithm, slightly modified: does not sort the data, but d[pos] will be ordered NOTE: pos must have the values 0,1,2,...,start-end ! 
(orderdouble is a kind of sorting pos according to the variable d) */ Long start, end; if (NAlast == Nan) { end = len-1; start = 0; } if (NAlast == True) { start = end = 0; Long NAend = len - 1; while (end < NAend) { while (NAend >= 0 && d[NAend] == NA_LONG) NAend--; while (end < NAend && d[end] != NA_LONG) end++; if (end < NAend) { Long swap = d[end]; d[end] = d[NAend]; d[NAend--] = swap; } } assert(NAend == end && false); } else { // if (NAlast == False) { start = end = len - 1; Long NAstart = 0; while (start > NAstart) { while(NAstart < len && d[NAstart] == NA_LONG) NAstart++; while (start > NAstart && d[start] != NA_LONG) start--; if (start > NAstart) { Long swap = d[start]; d[start] = d[NAstart]; d[NAstart++] = swap; } } assert(NAstart == start); } sortLong(start, end, d, from - 1, to - 1); } void sortingLong(Long *d, Long len, usr_bool NAlast) { sortingLongFromTo(d, len, 1, len, NAlast); } RandomFieldsUtils/src/RandomFieldsUtils.h0000644000176200001440000000434414227157055020207 0ustar liggesusers /* Authors Martin Schlather, schlather@math.uni-mannheim.de Copyright (C) 2015 -- 2021 Martin Schlather This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
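 The declarations that follow are the C entry points of the package: every
 function taking and returning SEXP is designed for R's .Call interface
 (e.g. sortX/orderX for the partial sorting routines, SolvePosDefR/Chol for
 the solvers, RFoptions for option handling), whereas the plain void
 prototypes (loadoptions, detachoptions, setCPUs, recompilationNeeded) are
 C-level housekeeping hooks.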
*/ #ifndef RFutils_public_H #define RFutils_public_H 1 #ifdef __cplusplus extern "C" { #endif SEXP scalarR(SEXP x, SEXP y, SEXP mode); SEXP struve(SEXP X, SEXP Nu, SEXP Factor_Sign, SEXP Expscaled); SEXP besselk_simd(SEXP X, SEXP Nu); SEXP I0ML0(SEXP X); SEXP gaussr(SEXP X, SEXP Derivative); SEXP WMr(SEXP X, SEXP Nu, SEXP Derivative, SEXP Factor); SEXP logWMr(SEXP X, SEXP Nu1, SEXP Nu2, SEXP Factor); SEXP SolvePosDefR(SEXP M, SEXP rhs, SEXP logdet); SEXP Chol(SEXP M); SEXP RFoptions(SEXP options); void loadoptions(); void detachoptions(); SEXP sortX(SEXP Data, SEXP From, SEXP To, SEXP NAlast); SEXP orderX(SEXP Data, SEXP From, SEXP To, SEXP NAlast); SEXP colMaxs(SEXP M); SEXP rowMeansX(SEXP M, SEXP Factor); SEXP rowProd(SEXP M); SEXP chol2mv(SEXP Chol, SEXP N); SEXP tcholRHS(SEXP C, SEXP RHS); SEXP DivByRow(SEXP M, SEXP V); SEXP quadratic(SEXP x, SEXP A); SEXP dbinorm(SEXP X, SEXP Sigma); SEXP dotXV(SEXP M, SEXP V); // void Ordering(double *d, int *len, int *dim, int *pos); SEXP crossprodX(SEXP X, SEXP Y, SEXP mode); SEXP DebugCall(); SEXP getPackagesToBeInstalled(SEXP Force); SEXP isGPUavailable(); SEXP isNEONavailable(); SEXP isX86_64(); void setCPUs(int *n); void recompilationNeeded(int *n); SEXP SIMDmessages(SEXP pkgs); SEXP debuggingLevel(); SEXP gpu_info(SEXP DEVICES); SEXP instruction_set(SEXP which, SEXP pkgs, SEXP used); #ifdef __cplusplus } #endif #endif RandomFieldsUtils/src/errors_messages.h0000644000176200001440000001415014227157055020016 0ustar liggesusers /* Authors Martin Schlather, schlather@math.uni-mannheim.de Copyright (C) 2015 -- 2021 Martin Schlather This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
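 The macros defined below follow one naming scheme: the trailing digit is the
 number of printf-style arguments consumed by the format string
 (ERR0..ERR8, FERR0..FERR7, and so on). The families differ in what happens
 after the message has been formatted:
   ERRn           raises the error immediately via RFERROR;
   FERRn          only stores the message in the buffer selected by
                  WHICH_ERRORSTRING;
   NERRn / SERRn  store the message and return the error code
                  (SERR uses the code ERRORM);
   CERRn          stores the message, sets err = ERRORM and continues the
                  enclosing loop;
   GERRn / GNERRn store the message, set err and jump to ErrorHandling;
   WARNn          formats the message and passes it to RFWARNING.

 A minimal usage sketch (illustrative only; GERR additionally requires an
 'err' variable and an 'ErrorHandling:' label in the enclosing function):

   if (size <= 0) ERR1("matrix size %d must be positive", size);
   if (err != NOERROR) GERR1("decomposition failed with code %d.", err);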
*/ // Datei wi #ifndef rfutils_error_H #define rfutils_error_H 1 #define NOERROR 0 #define ERRORMEMORYALLOCATION 1 #define ERRORFAILED 2 /* method didn't work for the specified parameters */ #define ERRORNOTPROGRAMMEDYET 3 #define ERRORM 4 /* a single error message */ #define ERRORMEND 12 /* a single error message -- und alles dazwischen */ #ifdef SCHLATHERS_MACHINE #define ERRLINE PRINTF("(ERROR in %s, line %d)\n", __FILE__, __LINE__) #else #define ERRLINE #endif #ifndef ERR #define ERR ERR0 #endif #define ERR0(X) {ERRLINE; RFERROR(X);} #define ERR00(X) ERRLINE; errorstring_type E_AUX; #define ERR1(X,Y) {ERR00(X);SPRINTF(E_AUX,X,Y); RFERROR(E_AUX);} #define ERR2(X,Y,Z) {ERR00(X);SPRINTF(E_AUX,X,Y,Z); RFERROR(E_AUX);} #define ERR3(X,Y,Z,A) {ERR00(X);SPRINTF(E_AUX,X,Y,Z,A); RFERROR(E_AUX);} #define ERR4(X,Y,Z,A,B) {ERR00(X);SPRINTF(E_AUX,X,Y,Z,A,B); RFERROR(E_AUX);} #define ERR5(X,Y,Z,A,B,C) {ERR00(X);SPRINTF(E_AUX,X,Y,Z,A,B,C); RFERROR(E_AUX);} #define ERR6(X,Y,Z,A,B,C,D) {ERR00(X);SPRINTF(E_AUX,X,Y,Z,A,B,C,D); RFERROR(E_AUX);} #define ERR7(X,Y,Z,A,B,C,D,E) {ERR00(X);SPRINTF(E_AUX,X,Y,Z,A,B,C,D,E); RFERROR(E_AUX);} #define ERR8(X,Y,Z,A,B,C,D,E,F) {ERR00(X);SPRINTF(E_AUX,X,Y,Z,A,B,C,D,E,F); RFERROR(E_AUX);} #ifndef LOCAL_ERRORSTRING #define LOCAL_ERRORSTRING errorstring_type loc_errorstring #endif #ifndef WHICH_ERRORSTRING #define WHICH_ERRORSTRING loc_errorstring #endif #define FERR0(X) LOCAL_ERRORSTRING; \ STRNCPY(WHICH_ERRORSTRING, X, MAXERRORSTRING); DEBUGINFOERR #if ! defined FERR #define FERR FERR0 #endif #define FERR1(X,Y) LOCAL_ERRORSTRING; \ SPRINTF(WHICH_ERRORSTRING, X, Y); DEBUGINFOERR #define FERR2(X,Y,Z) LOCAL_ERRORSTRING; \ SPRINTF(WHICH_ERRORSTRING, X, Y, Z); DEBUGINFOERR #define FERR3(X,Y,Z,A) LOCAL_ERRORSTRING; \ SPRINTF(WHICH_ERRORSTRING, X, Y, Z, A); DEBUGINFOERR #define FERR4(X,Y,Z,A,B) LOCAL_ERRORSTRING; \ SPRINTF(WHICH_ERRORSTRING,X,Y,Z,A,B); DEBUGINFOERR #define FERR5(X,Y,Z,A,B,C) LOCAL_ERRORSTRING; \ SPRINTF(WHICH_ERRORSTRING,X,Y,Z,A,B,C); DEBUGINFOERR #define FERR6(X,Y,Z,A,B,C,D) LOCAL_ERRORSTRING; \ SPRINTF(WHICH_ERRORSTRING,X,Y,Z,A,B,C,D); DEBUGINFOERR #define FERR7(X,Y,Z,A,B,C,D,E) LOCAL_ERRORSTRING; \ SPRINTF(WHICH_ERRORSTRING,X,Y,Z,A,B,C,D,E); DEBUGINFOERR #ifndef LOCAL_ERROR #define LOCAL_ERROR(N) {} #endif #define NERR00(N) LOCAL_ERROR(N); return N; #define NERR0(N,X) { FERR0(X); NERR00(N)} #if ! defined NERR #define NERR NERR0 #endif #define NERR1(N,X,Y) { FERR1(X, Y); NERR00(N)} #define NERR2(N,X, Y, Z) { FERR2(X, Y, Z); NERR00(N)} #define NERR3(N,X, Y, Z, A) { FERR3(X, Y, Z, A); NERR00(N)} #define NERR4(N,X, Y, Z, A, B) { FERR4(X, Y, Z, A, B); NERR00(N)} #define NERR5(N,X, Y, Z, A, B, C) { FERR5(X, Y, Z, A, B, C); NERR00(N)} #define NERR6(N,X, Y, Z, A, B, C, D) { FERR6(X, Y, Z, A,B,C,D); NERR00(N)} #define NERR7(N,X,Y,Z, A, B, C, D, E) { FERR7(X,Y,Z,A,B,C,D,E); NERR00(N)} #define SERR0(X) NERR0(ERRORM, X) #if ! defined SERR #define SERR SERR0 #endif #define SERR1(X,Y) NERR1(ERRORM, X, Y) #define SERR2(X,Y,Z) NERR2(ERRORM, X, Y, Z) #define SERR3(X,Y,Z, A) NERR3(ERRORM, X, Y, Z, A) #define SERR4(X,Y,Z, A, B) NERR4(ERRORM, X, Y, Z, A, B) #define SERR5(X,Y,Z, A, B, C) NERR5(ERRORM, X, Y, Z, A, B, C) #define SERR6(X,Y,Z, A, B, C, D) NERR6(ERRORM, X, Y, Z, A, B, C, D) #define SERR7(X,Y,Z, A, B, C, D, E) NERR7(ERRORM, X, Y, Z, A, B, C, D, E) #define CERR00 err=ERRORM; continue; #define CERR0(X) { FERR0(X); CERR00} #if ! 
defined CERR #define CERR CERR0 #endif #define CERR1(X,Y) { FERR1(X, Y); CERR00} #define CERR2(X, Y, Z) { FERR2(X, Y, Z); CERR00} #define CERR3(X, Y, Z, A) { FERR3(X, Y, Z, A); CERR00} #define GERR00 LOCAL_ERROR(ERRORM); err = ERRORM; goto ErrorHandling; #define GERR0(X) {FERR0(X); GERR00} #if ! defined GERR #define GERR GERR0 #endif #define GERR1(X,Y) {FERR1(X,Y); GERR00} #define GERR2(X,Y,Z) {FERR2(X,Y,Z); GERR00} #define GERR3(X,Y,Z,A) {FERR3(X,Y,Z,A); GERR00} #define GERR4(X,Y,Z,A,B) {FERR4(X,Y,Z,A,B); GERR00} #define GERR5(X,Y,Z,A,B,C) {FERR5(X,Y,Z,A,B,C); GERR00} #define GERR6(X,Y,Z,A,B,C,D) {FERR6(X,Y,Z,A,B,C,D); GERR00} #define GNERR00(N) err = N; goto ErrorHandling; #define GNERR0(N,X) {FERR0(X); GNERR00(N)} #if ! defined GNERR #define GNERR GNERR0 #endif #define GNERR1(N,X,Y) {FERR1(X,Y);GNERR00(N)} #define GNERR2(N,X,Y,Z) {FERR2(X,Y,Z); GNERR00(N)} #define GNERR3(N,X,Y,Z,A) {FERR3(X,Y,Z,A); GNERR00(N)} #define GNERR4(N,X,Y,Z,A,B) {FERR4(X,Y,Z,A,B); GNERR00(N)} #define GNERR5(N,X,Y,Z,A,B,C) {FERR5(X,Y,Z,A,B,C); GNERR00(N)} #define GNERR6(N,X,Y,Z,A,B,C,D) {FERR6(X,Y,Z,A,B,C,D); GNERR00(N)} #define RFWARNING warning #define WARN0 RFWARNING #define WARN1(X, Y) {errorstring_type W_MSG; \ SPRINTF(W_MSG, X, Y); RFWARNING(W_MSG);} #define WARN2(X, Y, Z) {errorstring_type W_MSG; \ SPRINTF(W_MSG, X, Y, Z); RFWARNING(W_MSG);} #define WARN3(X, Y, Z, A) {errorstring_type W_MSG;\ SPRINTF(W_MSG, X, Y, Z, A); RFWARNING(W_MSG);} #define WARN4(X, Y, Z, A, B) {errorstring_type W_MSG; \ SPRINTF(W_MSG, X, Y, Z, A, B); RFWARNING(W_MSG);} #define WARN5(X, Y, Z, A, B, C) {errorstring_type W_MSG; \ SPRINTF(W_MSG, X, Y, Z, A, B, C); RFWARNING(W_MSG);} #define WARN6(X, Y, Z, A, B,C,D) {errorstring_type W_MSG; \ SPRINTF(W_MSG, X, Y, Z, A, B, C, D); RFWARNING(W_MSG);} #define WARN7(X, Y, Z,A,B,C,D,E) {errorstring_type W_MSG; \ SPRINTF(W_MSG, X, Y, Z, A, B, C, D, E); RFWARNING(W_MSG);} #endif RandomFieldsUtils/src/kleinkram.h0000644000176200001440000002521314227157055016572 0ustar liggesusers/* Authors Martin Schlather, schlather@math.uni-mannheim.de Copyright (C) 2015 -- 2021 Martin Schlather This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
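 Three groups of small helpers are declared below:
   - builders that copy C arrays into fresh R objects (Int, Num, Logic, Char,
     Mat, Mat_t, MatInt, String, TooLarge, ...),
   - readers that extract values from R arguments with basic checking
     (Integer, Real, Logical, NonNegInteger, PositiveReal, GetName, Match, ...),
   - small dense linear-algebra kernels whose names encode the operation:
     xA is x^T A, Ax is A x, AtA is A^T A, xAx and xUx evaluate the quadratic
     form x^T A x, and the matmult variants are (possibly transposed) matrix
     products.

 An illustrative sketch of the naming convention (the concrete buffers are
 made up for the example; semantics as the names suggest):

   double x[3] = {1, 2, 3}, A[9] = {0}, y[3];   // A column-major, as in R
   xA(x, A, 3, 3, y);   // y = x^T A, length ncol
   Ax(A, x, 3, 3, y);   // y = A x,   length nrow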
*/ // by 3.2.2021: xAx:: BLAS lohnt noch nicht // A^t A: BLAS lohnt sich ab aA = k x n, k >=8, n > MAXOWN #ifndef kleinkram_rfutils_h #define kleinkram_rfutils_h 1 #if defined OBSOLETE_RFU && !defined RFU_NEED_OBSOLETE // #warning OBSOLETE_RFU void strcopyN(char *dest, const char *src, int n); usr_bool UsrBool(SEXP p, char *name, int idx); usr_bool UsrBoolRelaxed(SEXP p, char *name, int idx); #define INT Integer(el, name, 0) #define LOGI Logical(el, name, 0) #define NUM Real(el, name, 0) #define USRLOG UsrBool(el, name, 0) #define USRLOGRELAXED UsrBoolRelaxed(el, name, 0) #define CHR Char(el, name) #define STR(X, N) strcopyN(X, CHAR(STRING_ELT(el, 0)), N); #define POS0INT NonNegInteger(el, name) /* better: non-negative */ #define POS0NUM NonNegReal(el, name) #define NEG0NUM NonPosReal(el, name) #define POSINT PositiveInteger(el, name) /* better: non-negative */ #define POSNUM PositiveReal(el, name) SEXP Logic(bool* V, int n, int max) ; SEXP Num(double* V, int n, int max) ; SEXP Int(int *V, int n, int max) ; SEXP Char(const char **V, int n, int max) ; SEXP Mat(double* V, int row, int col, int max); SEXP Mat_t(double* V, int row, int col, int max); SEXP MatInt(int* V, int row, int col, int max) ; SEXP MatString(char **V, int row, int col, int max); //SEXP Array3D(int** V, int depth, int row, int col, int max) ; SEXP String(char *V); SEXP Logic(bool* V, int n) ; SEXP Num(double* V, int n) ; SEXP Int(int *V, int n) ; SEXP Char(const char **V, int n) ; SEXP Mat(double* V, int row, int col); SEXP Mat_t(double* V, int row, int col); SEXP MatInt(int* V, int row, int col) ; SEXP MatString(char** V, int row, int col); //SEXP Array3D(int** V, int depth, int row, int col) ; SEXP String(char V[][MAXCHAR], int n, int max); SEXP String(int *V, const char * List[], int n, int endvalue); SEXP TooLarge(int *n, int l); SEXP TooSmall(); double Real(SEXP p, char *name, int idx); void Real(SEXP el, char *name, double *vec, int maxn) ; int Integer(SEXP p, char *name, int idx, bool nulltoNA) ; int Integer(SEXP p, char *name, int idx); void Integer(SEXP el, char *name, int *vec, int maxn) ; void Integer2(SEXP el, char *name, int *vec) ; bool Logical(SEXP p, char *name, int idx); char Char(SEXP el, char *name) ; double NonNegInteger(SEXP el, char *name) ; double NonNegReal(SEXP el, char *name) ; double NonPosReal(SEXP el, char *name) ; double PositiveInteger(SEXP el, char *name) ; double PositiveReal(SEXP el, char *name) ; void String(SEXP el, char *name, char names[][MAXCHAR], int maxlen); #define MULTIPLEMATCHING -2 #define NOMATCHING -1 #define MATCHESINTERNAL -3 int Match(char *name, const char * List[], int n); int Match(char *name, name_type List, int n); SEXP ExtendedInteger(double x); SEXP ExtendedBooleanUsr(usr_bool x); double XkCXtl(double *X, double *C, int nrow, int dim, int k, int l); void XCXt(double *X, double *C, double *V, int nrow, int dim); void AtA(double *a, int nrow, int ncol, double *A); void xA(double *x, double*A, int nrow, int ncol, double *y); void xA_noomp(double *x, double*A, int nrow, int ncol, double *y); void xA(double *x1, double *x2, double*A, int nrow, int ncol, double *y1, double *y2); void xAx(double *x, double*A, int nrow, double *y); void Ax(double *A, double*x, int nrow, int ncol, double *y); void Ax(double *A, double*x1, double*x2, int nrow, int ncol, double *y1, double *y2); double xUy(double *x, double *U, double *y, int dim); double xUxz(double *x, double *U, int dim, double *z); double x_UxPz(double *x, double *U, double *z, int dim); double xUx(double *x, double *U, int 
dim); void matmult(double *A, double *B, double *C, int l, int m, int n); void matmulttransposed(double *A, double *B, double *C, int m, int l, int n); void matmult_2ndtransp(double *A, double *B, double *C, int m, int l, int n); void matmult_tt(double *A, double *B, double *C, int m, int l, int n); double *matrixmult(double *m1, double *m2, int dim1, int dim2, int dim3); void GetName(SEXP el, char *name, const char * List[], int n, int defaultvalue, int endvalue, int *ans, int maxlen_ans); int GetName(SEXP el, char *name, const char * List[], int n) ; int GetName(SEXP el, char *name, const char * List[], int n, int defaultvalue) ; #define SCALAR_PROD(A, B, N, ANS) { \ int k_ =0, \ end_ = N - 4; \ ANS = 0.0; \ for (; k_ #include #include "RandomFieldsUtils.h" #include "kleinkram.h" #include "options.h" #include "Utils.h" #include "xport_import.h" #include "extern.h" #if defined AVX2 ASSERT_SIMD(avx2_fctns, avx2); #define algn_general(X) ((1U + (uintptr_t) (((uintptr_t) X - 1U) / BytesPerBlock)) * BytesPerBlock) #if defined SSE41 || defined AVX2 int static inline *algnInt(int *X) { assert(algn_general(X)>=(uintptr_t)X); return (int *) algn_general(X); } #endif void colMaxsIint256(int *M, Long r, Long c, int *ans) { if (r < 32 #if defined AVX2 || !avx2Avail #elif defined SSE41 || !sse41Avail #endif ) { for (int i=0; i #include "RandomFieldsUtils.h" #include "zzz_RandomFieldsUtils.h" #include "Utils.h" #include "xport_import.h" double struve_intern(double x, double nu, double factor_Sign, bool expscaled) { if ((x == 0.0) && (nu>-1.0)) return 0.0; if (x <= 0.0) return RF_NA; // not programmed yet double exp_dummy, dummy = 0.0, logx = 2.0 * LOG(0.5 * x), x1 = 1.5, x2 = nu + 1.5, value = 1.0, fsign = factor_Sign, epsilon=1e-20; do { dummy += logx - LOG(x1) - LOG(FABS(x2)); exp_dummy = EXP(dummy); value += (1 - 2 * (x2 < 0)) * fsign * exp_dummy; // printf("%10g %10g %10g %10g\n", value, fsign, x1, x2); x1 += 1.0; x2 += 1.0; fsign = factor_Sign * fsign; } while (exp_dummy > FABS(value) * epsilon); x1 = 1.5; x2 = nu + 1.5; if (x2 > 0.0) { dummy = (nu + 1.0) * 0.5 * logx - lgammafn(x1) - lgammafn(x2); if (expscaled) dummy -= x; value *= EXP(dummy); } else { //if ( (double) ((int) (x1-0.5)) != x1-0.5 ) return RF_NA; value *= POW(0.5 * x, nu + 1.0) / (gammafn(x1) * gammafn(x2)); if (expscaled) value *= EXP(-x); } return value; } //void Struv eH(double *x, double *nu) {*x=struv e(*x, *nu, -1.0, false);} //void Struv eL(double *x, double *nu, int * expScaled) { // *x=struv e(*x, *nu, 1.0, (bool) *expScaled); //} double StruveH(double x, double nu) {return struve_intern(x, nu, -1.0, false);} double StruveL(double x, double nu, bool expScaled) { return struve_intern(x, nu, 1.0, expScaled); } SEXP struve(SEXP X, SEXP Nu, SEXP Factor_Sign, SEXP Expscaled) { int i, lenx = length(X), lennu = length(Nu), len = lenx; if (len < lennu) len = lennu; SEXP Result; PROTECT(Result = allocVector(REALSXP, len)); double *x = REAL(X), *nu = REAL(Nu), factor_sign = REAL(Factor_Sign)[0], *result = REAL(Result); bool expscaled = LOGICAL(Expscaled)[0]; for (i=0; i MATERN_NU_THRES; double bk[MATERN_NU_THRES + 1U]; if (x > LOW_MATERN && nu < RF_INF) { if (x == RF_INF) return RF_NEGINF; if (simple) { if (nuThres != KT->nuOld) { KT->nuOld = nuThres; KT->loggamma_old = lgammafn(nuThres); } loggamma = KT->loggamma_old; } else { if (nu1 != KT->nu1old) { KT->nu1old = nu1; KT->loggamma1old = lgammafn(nu1); } if (nu2 != KT->nu2old) { KT->nu2old = nu2; KT->loggamma2old = lgammafn(nu2); } loggamma = 0.5 * (KT->loggamma1old + 
KT->loggamma2old); } y = x * scale; v = LOG2 + nuThres * LOG(0.5 * y) - loggamma + LOG(bessel_k_ex(y, nuThres, 2.0, bk)) - y; } else v = 0.0; if (nu > MATERN_NU_THRES) { // factor!=0.0 && double w, g = MATERN_NU_THRES / nu; y = x * factor / 2; w = logGauss(y); //if (nu>100) printf("nu=%10g %10e %10e %10e\n", nu, v, g, w); v = v * g + (1.0 - g) * w; if (nu1 != nu2) { // consistenz zw. nu1, nu2 und nuThres wiederherstellen v += lgammafn(nu)- 0.5 * (lgammafn(nu1) + lgammafn(nu2)); // !nuThres } // if (!R_FINITE(v)) ERR0("non-finite value in the whittle-matern model -- value of 'nu' is much too large"); //if (nu>100) printf("v=%10g \n", v); } return v; } double WM(double x, double nu, double factor) { // check calling functions, like hyperbolic and gneiting if any changings !! return EXP(logWM(x, nu, nu, factor)); } double DWM(double x, double nu, double factor) { KEY_type *KT = KEYT(); double y, v, nuThres = nu < MATERN_NU_THRES ? nu : MATERN_NU_THRES, scale = 1.0; if (factor != 0.0) scale = factor * SQRT(nuThres); double bk[MATERN_NU_THRES + 1U]; if (x > LOW_MATERN && nu < RF_INF) { if (x == RF_INF) return 0.0; if (nuThres!=KT->nuOld) { KT->nuOld = nuThres; KT->loggamma_old = lgammafn(nuThres); } y = x * scale; v = - 2.0 * EXP(nuThres * LOG(0.5 * y) - KT->loggamma_old + LOG(bessel_k_ex(y, nuThres - 1.0, 2.0, bk)) - y); } else { v = (nuThres > 0.5) ? 0.0 : (nuThres < 0.5) ? INFTY : 1.253314137; } v *= scale; if (nu > MATERN_NU_THRES) { double w, g = MATERN_NU_THRES / nu; scale = factor / 2.0; y = x * scale; w = DGauss(y) * scale; v = v * g + (1.0 - g) * w; } return v; } double DDWM(double x, double nu, double factor) { KEY_type *KT = KEYT(); double y, v, nuThres = nu < MATERN_NU_THRES ? nu : MATERN_NU_THRES, scale = 1.0; if (factor != 0.0) scale = factor * SQRT(nuThres); double scaleSq = scale * scale, bk[MATERN_NU_THRES + 1U]; if (x > LOW_MATERN && nu < RF_INF) { if (x == RF_INF) return 0.0; if (nuThres!=KT->nuOld) { KT->nuAlt = nuThres; KT->gamma = gammafn(nuThres); } y = x * scale; v = POW(0.5 * y , nuThres - 1.0) / KT->gamma * (- bessel_k_ex(y, nuThres - 1.0, 1.0, bk) + y * bessel_k_ex(y, nuThres - 2.0, 1.0, bk)); } else { v = (nu > 1.0) ? -0.5 / (nu - 1.0) : INFTY; } v *= scaleSq; if (nu > MATERN_NU_THRES) { double w, g = MATERN_NU_THRES / nu; scale = factor / 2.0; scaleSq = scale * scale; y = x * scale; w = DDGauss(y) * scaleSq; v = v * g + (1.0 - g) * w; } return v; } double D3WM(double x, double nu, double factor) { KEY_type *KT = KEYT(); double y, v, nuThres = nu < MATERN_NU_THRES ? nu : MATERN_NU_THRES, scale = (factor != 0.0) ? factor * SQRT(nuThres) : 1.0, scaleSq = scale * scale; double bk[MATERN_NU_THRES + 1U]; if (x > LOW_MATERN && nu < RF_INF) { if (x == RF_INF) return 0.0; if (nuThres!=KT->nuOld) { KT->nuAlt = nuThres; KT->gamma = gammafn(nuThres); } y = x * scale; v = POW(0.5 * y , nuThres - 1.0) / KT->gamma * ( 3.0 * bessel_k_ex(y, nuThres - 2.0, 1.0, bk) -y * bessel_k_ex(y, nuThres - 3.0, 1.0, bk)); } else { v = 0.0; } v *= scaleSq * scale; if (nu > MATERN_NU_THRES) { double w, g = MATERN_NU_THRES / nu; scale = factor / 2.0; scaleSq = scale * scale; y = x * scale; w = D3Gauss(y) * scaleSq * scale; v = v * g + (1.0 - g) * w; } return v; } double D4WM(double x, double nu, double factor) { KEY_type *KT = KEYT(); double y, v, nuThres = nu < MATERN_NU_THRES ? nu : MATERN_NU_THRES, scale = (factor != 0.0) ? 
factor * SQRT(nuThres) : 1.0, scaleSq = scale * scale; double bk[MATERN_NU_THRES + 1U]; // printf("x=%10g nu=%10g\n", x, nuThres); if (x > LOW_MATERN && nu < RF_INF) { if (x == RF_INF) return 0.0; if (nuThres!=KT->nuOld) { KT->nuAlt = nuThres; KT->gamma = gammafn(nuThres); } y = x * scale; v = 0.25 * POW(0.5 * y , nuThres - 3.0) / KT->gamma * (+ 6.0 * (nuThres - 3.0 - y * y) * bessel_k_ex(y, nuThres - 3.0, 1.0, bk) + y * (3.0 + y * y) * bessel_k_ex(y, nuThres - 4.0, 1.0, bk)); } else { v = INFTY; if (nuThres > 2.0) v = 0.75 / ((nuThres - 1.0) * (nuThres - 2.0)); } v *= scaleSq * scaleSq; if (nu > MATERN_NU_THRES) { double w, g = MATERN_NU_THRES / nu; scale = factor / 2.0; scaleSq = scale * scale; y = x * scale; w = D4Gauss(y) * scaleSq * scaleSq; v = v * g + (1.0 - g) * w; } // printf("v=%10g\n", v); return v; } typedef double (*primfct1)(double); typedef double (*primfct3)(double, double, double); #define CALCULATE(PRIMFCTN) \ double *x = REAL(X); \ int n = length(X), \ deriv = INTEGER(Derivative)[0]; \ if (deriv < 0 || deriv > 4) ERR0("value of 'derivative' out of range"); \ PRIMFCTN F = fctns[deriv]; \ \ SEXP Ans; \ PROTECT(Ans=allocVector(REALSXP, n)); \ double *ans = REAL(Ans); \ for (int i=0; i #include #include "kleinkram.h" #include "General_utils.h" #include "zzz_RandomFieldsUtils.h" #include "xport_import.h" const char // constants cannot be exported; *KKR_TYPE_NAMES[LAST_R_TYPE_NAME + 1] = { // never change ! see AutoRFU.cc "NILSXP" /* 0 */, "SYMSXP", "LISTSXP", "CLOSXP", "ENVSXP", "PROMSXP", "LANGSXP", "SPECIALSXP", "BUILTINSXP", "CHARSXP", "LGLSXP" /* 10 */, "??", "??", "INTSXP", "REALSXP", "CPLXSXP", "STRSXP", "DOTSXP", "ANYSXP", "ECSXP", "EXPRSXP" /*20 */, "BCODESXP", "EXTPTRSXP", "WEAKREFSXP", "RAWSXP", "S4SXP" /* 25 */, "", "", "", "", "NEWSXP" /* 30 */, "FREESXP", "??SXP"}; #define USE_OWN_ALG(SCALAR_LEN, PARALLEL) true #define USE_OWN_SCALAR_PROD true #define SCALAR(A,B,C) scalarX(A,B,C, SCALAR_AVX) void strcopyN(char *dest, const char *src, int n) { if (n > 1) { n--; strncpy(dest, src, n); } dest[n] = '\0'; } void AtA(double *a, Long nrow, Long ncol, double *C, int VARIABLE_IS_NOT_USED cores) { // C = A^T %*% A if (USE_OWN_ALG(nrow, ncol) || nrow * ncol > MAXINT) { #ifdef DO_PARALLEL #pragma omp parallel for num_threads(cores) schedule(dynamic, 20) if (MULTIMINSIZE(ncol)) #endif for (Long i=0; i MAXINT) { double sum = 0.0; #ifdef DO_PARALLEL #pragma omp parallel for num_threads(cores) reduction(+:sum) schedule(static) if (MULTIMINSIZE(nrow) && MULTIMINSIZE(nrow)) #endif for (Long i=0; i 1000) #endif for (Long i=0; i 1000) #endif for (Long i=0; imax) return TooLarge(n); if (n<0) return TooSmall(); PROTECT(Ans=allocVector(INTSXP, (int) n)); MEMCOPY(INTEGER(Ans), V, n * sizeof(int)); UNPROTECT(1); return Ans; } SEXP Int(int* V, Long n) { return Int(V, n, n); } SEXP Logic(bool* V, Long n, Long max) { SEXP Ans; if (V==NULL) return allocVector(VECSXP, 0); if (n>max) return TooLarge(n); if (n<0) return TooSmall(); PROTECT(Ans=allocVector(LGLSXP, (int) n)); int *ans = LOGICAL(Ans); for (Long i=0; imax) return TooLarge(n); if (n<0) return TooSmall(); PROTECT(Ans=allocVector(REALSXP, (int) n)); MEMCOPY(REAL(Ans), V, n * sizeof(double)); UNPROTECT(1); return Ans; } SEXP Num(double* V, Long n) { return Num(V, n, n); } SEXP Char(const char **V, Long n, Long max) { SEXP Ans; if (V==NULL) return allocVector(STRSXP, 0); if (n>max) return TooLarge(n); if (n<0) return TooSmall(); PROTECT(Ans=allocVector(STRSXP, (int) n)); for (Long i=0; imax) return TooLarge(row, col); SEXP Ans; 
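  // Copies the column-major C array V (row*col doubles) into a freshly
  // allocated R matrix. Oversized input has already been redirected to
  // TooLarge() above; the data are duplicated by the MEMCOPY below, so the
  // caller keeps ownership of V.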
PROTECT(Ans=allocMatrix(REALSXP, (int) row, (int) col)); MEMCOPY(REAL(Ans), V, n * sizeof(double)); UNPROTECT(1); return Ans; } SEXP Mat(double* V, Long row, Long col) { return Mat(V, row, col, MAXINT); } SEXP Mat_t(double* V, Long row, Long col, Long max) { if (V==NULL) return allocMatrix(REALSXP, 0, 0); Long n = row * col; if (n>max) return TooLarge(row, col); SEXP Ans; PROTECT(Ans=allocMatrix(REALSXP, (int) row, (int) col)); double *ans = REAL(Ans); for (Long k=0, j=0; jmax) return TooLarge(row, col); SEXP Ans; PROTECT(Ans=allocMatrix(STRSXP, (int) row, (int) col)); for (Long k=0; kmax) return TooLarge(row, col); SEXP Ans; PROTECT(Ans=allocMatrix(INTSXP, (int) row, (int) col)); MEMCOPY(INTEGER(Ans), V, n * sizeof(int)); UNPROTECT(1); return Ans; } SEXP MatInt(int* V, Long row, Long col) { return MatInt(V, row, col, MAXINT); } SEXP Array3D(double** V, Long depth, Long row, Long col, Long max) { if (V==NULL) return alloc3DArray(REALSXP, 0, 0, 0); Long m = row * col, n = row * col * depth; if (n>max) { int nn[3] = { (int) row, (int) col, (int) depth }; return TooLarge(nn, 3); } SEXP Ans; PROTECT(Ans=alloc3DArray(REALSXP, (int) depth, (int) row, (int) col)); double *ans = REAL(Ans); for (Long j=0; jmax) return TooLarge(n); if (n<0) return TooSmall(); PROTECT(str = allocVector(STRSXP, (int) n)); for (Long i=0; i= n) j=0; } return; } int Integer(SEXP p, char *name, Long idx, bool nulltoNA) { //printf("integer %s %d %d len=%d\n", name, idx, nulltoNA, length(p)); if (p != R_NilValue) { assert(idx < length(p)); switch(TYPEOF(p)) { case INTSXP : return INTEGER(p)[idx]; case REALSXP : double value; value = REAL(p)[idx]; if (ISNAN(value)) { return NA_INTEGER; } int intvalue; intvalue = (int) value; if (value == intvalue) return intvalue; else { RFERROR2("%.50s: integer value expected. Got %10e.", name, value); } case LGLSXP : if (LOGICAL(p)[idx]==NA_LOGICAL) return(NA_INTEGER); else return((int) LOGICAL(p)[idx]); default : {} } } else if (nulltoNA) return NA_INTEGER; RFERROR2("%.50s: incorrect type. Got '%.50s'.", name, TYPEOF(p) <= LAST_R_TYPE_NAME ? KKR_TYPE_NAMES[TYPEOF(p)] : "something else"); return NA_INTEGER; // compiler warning vermeiden } int Integer(SEXP p, char *name, Long idx) { return Integer(p, name, idx, false); } void Integer(SEXP el, char *name, int *vec, Long maxn) { if (el == R_NilValue) { RFERROR1("'%.50s' cannot be transformed to integer.\n",name); } Long n = length(el); for (Long j=0, i=0; i= n) j=0; } } void Integer2(SEXP el, char *name, int *vec) { Long n = length(el); if (n == 0) RFERROR1("'%.50s' cannot be transformed to integer.\n",name); vec[0] = Integer(el, name, 0); if (vec[0] != NA_INTEGER && vec[0] < 1) RFERROR1("first component of '%.50s' must be at least 1", name); if (n == 1) vec[1] = vec[0]; else { vec[1] = Integer(el, name, n-1); if ( vec[1] != NA_INTEGER && vec[1] < vec[0]) RFERROR1("'%.50s' must be increasing", name); if (n > 2) { vec[1] = vec[0]; for (Long i = 1; i maxlen) { RFERROR1("number of variable names exceeds %d. Take abbreviations?", (int) maxlen); } type = TYPEOF(el); if (type == CHARSXP) { for (Long i=0; i0.0) { num=0.0; WARN1("%.50s, which has been positive, is set 0.\n",name); } return num; } int PositiveInteger(SEXP el, char *name) { int num = INT; if (num <= 0) { WARN2("'%.50s', which has been %.50s, is set 1.\n", name, num ? "negative" : "0"); num=1; } return num; } double PositiveReal(SEXP el, char *name) { double num = NUM; if (num<=0.0) { WARN2("'%.50s', which has been %.50s, is set 1.\n", name, num==0.0 ? 
"0" : "negative"); num=1.0; } return num; } SEXP ExtendedInteger(double x) { return ScalarInteger(R_FINITE(x) ? x : NA_INTEGER); } SEXP ExtendedBooleanUsr(usr_bool x) { return ScalarLogical((int) x); } int Match(char *name, name_type List, int n) { // == NOMATCHING, -1, if no matching function is found // == MULTIPLEMATCHING,-2, if multiple matching fctns are found, // if more than one match exactly, the last one is taken (enables overwriting // standard functions) Ulong ln = STRLEN(name); int Nr=0; while ( Nr < n && STRNCMP(name, List[Nr], ln)) Nr++; if (Nr < n) { if (ln==STRLEN(List[Nr])) // exactmatching -- take first -- changed 1/7/07 return Nr; // a matching function is found. Are there other functions that match? int j; bool multiplematching=false; j=Nr+1; // if two or more covariance functions have the same name // the last one is taken while (j maxlen_ans) RFERROR2("option '%.50s' is too lengthy. Maximum length is %d.", name, maxlen_ans); if (TYPEOF(el) == STRSXP) { for (; k= n) goto ErrorHandling0; } for (k=len_el; k= 0) { ans[0] = defaultvalue; for (k=1; k 0) { PRINTF("options starting with prefix '%.50s' have been already attached.", PKGprefixlist[0]); } return; } } if (basicopt) option_class_list[noption_class_list++] = PKGprefixlist[0]; if (NList >= MAXNLIST) BUG; strcopyN(pkgnames[NList], pkgname, PKGNAMELENGTH); Allprefix[NList] = PKGprefixlist; AllprefixN[NList] = N; Allall[NList] = PKGall; AllallN[NList] = PKGallN; setoption_fct_list[NList][NList] = NULL; getoption_fct_list[NList][NList] = NULL; finaloption_fct_list[NList] = NULL; deloption_fct_list[NList] = NULL; setparam[NList] = set; finalparam[NList] = final; getparam[NList] = get; delparam[NList] = del; min_simd_needs[NList] = min_gpu_needs[NList] = Inone; NList++; PLoffset = pl_offset; PL = OPTIONS.basic.Cprintlevel = OPTIONS.basic.Rprintlevel + PLoffset; CORES = OPTIONS.basic.cores; } void detachRFoptions(const char **PKGprefixlist, int N) { detachRFUoptions(PKGprefixlist, N); } void getUtilsParam(utilsoption_type **global) { *global = &OPTIONS; // OK! 
} bool is_positive_definite( double * C, int dim) { return Is_positive_definite( C, dim, 1);} double detPosDef(double * M, int size) { return DetPosDef( M, size, 1) ;} int invertMatrix(double * M, int size) { return InvertMatrix( M, size, 1);} double detPosDefsp(double * M, int size, solve_options * sp) { return DetPosDefsp( M, size, sp, 1) ;} int XCinvXdet(double* M, int size, double *X, int X_cols, double * XCinvX, double * det, bool log, solve_storage *PT) { return xCinvXdet( M, size, X, X_cols,XCinvX, det, log, PT, 1) ;} int XCinvYdet(double* M, int size, bool posdef, double * X, double * Y, int cols, double * XCinvY, double * det, bool log, solve_storage *PT) { return xCinvYdet( M, size, posdef, X, Y, cols,XCinvY, det, log, PT, 1) ;} int chol(double * MPT, int size) { return cholesky( MPT, size, 1) ;} int solvePosDef(double* M, int size, bool posdef, double * rhs, int rhs_cols, double * logdet, solve_storage * PT) { return SolvePosDef( M, size, posdef, rhs, rhs_cols, logdet, PT, 1);} int solvePosDefSp(double * M, int size, bool posdef, double * rhs, int rhs_cols, double *logdet, solve_storage * Pt, solve_options *sp) { return SolvePosDefSp( M, size, posdef,rhs, rhs_cols, logdet, Pt, sp, 1);} int sqrtPosDefFree(double * M, int size, solve_storage * pt, solve_options * sp) { return SqrtPosDefFree( M, size, pt,sp, 1) ;} #ifdef __cplusplus } #endif RandomFieldsUtils/src/solve.cc0000644000176200001440000021716314227157055016112 0ustar liggesusers/* Authors Martin Schlather, schlather@math.uni-mannheim.de Copyright (C) 2015 - 2017 Martin Schlather, Reinhard Furrer, Martin Kroll Copyright (C) 2017 - 2020 Martin Schlather Copyright (C) 2021 - 2022 Martin Schlather, Alexander Freudenberg Copyright (C) 2023 -- 2024 Martin Schlather This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #ifdef DO_PARALLEL #include #endif #include "Basic_utils_local.h" #include #if ! 
defined FCONE // only as RFU has errror that R V 3x is used #if defined __GCC__ #warning FCONE not available #endif #define FCONE #endif #define LOCAL_ERRORSTRING #define WHICH_ERRORSTRING pt->err_msg #include "RandomFieldsUtils.h" #include "zzz_RandomFieldsUtils.h" #include "kleinkram.h" #include "xport_import.h" #include "extern.h" extern const char * solve[solveN]; #define SCALAR(A,B,C) scalarX(A,B,C,NR) #define LINEAR(A,B,C,D) linearX(A,B,C,D, linear_avx) #ifdef USEGPU #include "solve_gpu.h" #else SIMD_MISS(solve_61, gpu); #endif AVAILABLE_SIMD const char * InversionNames[nr_InversionMethods] = { "cholesky", "svd", "eigen", "sparse", "method undefined", "qr", "lu", "no method left", "GPU-cholesky", "R chol implementation", "direct formula", "diagonal"}; #define KAHAN OPTIONS.installNrun.kahanCorrection #define CMALLOC(WHICH, N, TYPE) { \ Long _N_ = N; \ if (pt->n_##WHICH < _N_) { \ if (pt->n_##WHICH < 0) BUG; \ FREE(pt->WHICH); \ pt->n_##WHICH = _N_; \ if ((pt->WHICH = (TYPE *) CALLOC(_N_, sizeof(TYPE))) == NULL) \ return ERRORMEMORYALLOCATION; \ } else { \ assert( (_N_ > 0 && pt->WHICH != NULL) || _N_ == 0); \ for (Long iii=0; iii<_N_; pt->WHICH[iii++] = 0); \ } \ } \ TYPE VARIABLE_IS_NOT_USED *WHICH = pt->WHICH /* // sqrtPosDef nutzt pt->U fuer das Ergebnis #define FREEING_RESULT(WHICH) \ assert(int VARIABLE_IS_UNUSED *_i = WHICH); \ if (pt->WHICH != NULL && pt->WHICH != result) { \ UNCONDFREE(pt->WHICH); \ pt->n_##WHICH = 0; \ } */ double Determinant(double *M, int size, bool log) { Long sizeP1 = size + 1, sizeSq = (Long) size * size; if (log) { double tmp = 0.0; for (Long i=0; ito_be_deleted); } void solve_DELETE(solve_storage **S) { solve_storage *x = *S; if (x!=NULL) { solve_DELETE0(*S); UNCONDFREE(*S); } } void solve_NULL(solve_storage* x) { if (x == NULL) return; MEMSET(x, 0, sizeof(solve_storage)); x->nsuper = x->size = -1; x->method = NoInversionMethod; for (int i=0; inewMethods[i++] = NoInversionMethod); x->actual_pivot = PIVOT_UNDEFINED; } double static inline det3(double *M, int size) { double det; switch(size){ // Abfrage nach Groesse der Matrix M + Berechnung der Determinante per Hand case 1: det = M[0]; break; case 2: det = M[0] * M[3] - M[1] * M[2]; break; case 3: det = M[0] * (M[4] * M[8] - M[5] * M[7]) - M[1] * (M[3] * M[8] - M[5] * M[6]) + M[2] * (M[3] * M[7] - M[4] * M[6]); // Entwicklung nach 1. Spalte break; default : BUG; break; } return det; } int logdet3(double det, bool posdef, double *logdet, bool log) { if (posdef && det < 0) return ERRORFAILED; if (logdet != NULL) { if (log) { if (det <= 0) return ERRORFAILED; *logdet = LOG(det); } else *logdet = det; } return NOERROR; } int solve3(double *M, int size, bool posdef, double *rhs, int rhs_cols, double *result, double *logdet, bool log, solve_storage *pt ){ assert(size <= 3); if (size <= 0) SERR0("matrix in 'solvePosDef' of non-positive size."); double det = det3(M, size); if (logdet3(det, posdef, logdet, log) != NOERROR) return ERRORFAILED; double detinv = 1.0 / det; // determinant of inverse of M switch(size){ case 1 : {// size of matrix == 1 if (rhs_cols == 0) result[0] = detinv; else for (int i=0; i 0.0 ? M[size] / res[0] : 0.0; double dummy = M[size + 1] - res[size] * res[size]; res[size + 1] = SQRT(MAX(0.0, dummy)); if (size == 2) return NOERROR; res[2] = res[5] = 0.0; res[6] = res[0] > 0.0 ? M[6] / res[0] : 0.0; res[7] = res[4] > 0.0 ? 
(M[7] - res[3] * res[6]) / res[4] : 0.0; dummy = M[8] - res[6] * res[6] - res[7] * res[7]; res[8] = SQRT(MAX(0.0, dummy)); return NOERROR; } void Sort(double *RESULT, Long size, Long rhs_cols, int *pi, int *rank, double *dummy) { if (size > MAXINT) BUG; orderingInt(pi, (int) size, 1, rank); Long i=0, totalRHS = (Long) size * rhs_cols; while(i < size && i == rank[i]) i++; while (i < size) { Long stored_i = i, read = i; double *p_write = NULL, *p_read = RESULT + read; for (Long k=0; k 60) schedule(dynamic, 20) #endif for (Long k=0; k 60) schedule(dynamic, 20) #endif for (Long k=0; kk; i--) { double *pM = MPT + i * size, r = (p_RESULT[i] /= pM[i]); diagonal[k] -= r *pM[k]; LINEAR(pM + k + 1, -r, i-k-1, p_RESULT + k + 1); // for (int j=k+1; j MAXINT) BUG; assert(sp != NULL); assert(NA_LOGICAL == INT_MIN && NA_LOGICAL == NA_INTEGER); // nur zur sicherheit, wegen usr_bool // eigentlich sollte usr_bool unabhaengig davon funktionieren assert(calculate != DETERMINANT || (logdet != NULL && result == NULL && rhs0 == NULL)); assert(calculate != MATRIXSQRT || (rhs0 == NULL && posdef)); assert((rhs_cols != 0) xor (rhs0 == NULL)); double *RESULT = result != NULL ? result : rhs_cols > 0 ? rhs0 : M0; // Pivot_Cholesky: // if (MPT == Morig || (rhs_cols > 0 && rhs == RESULT)) // CERR0("Pivoted cholesky cannot be performed on place! Either you are a programmer or you should contact the maintainer."); // !MATRIXSQRT && rhs_cols > 0 // assert(rhs != RESULT); la_modes la_mode = OPTIONS.installNrun.la_mode; if (size <= sp->tinysize) { if (Pt != NULL) { Pt->method = direct_formula; Pt->size = size; } if (calculate == DETERMINANT) return logdet3(det3(M0, size), posdef, logdet, sp->det_as_log); else if (calculate == MATRIXSQRT) return chol3(M0, size, RESULT, Pt); else return solve3(M0, size, posdef, rhs0, (int) rhs_cols, RESULT, logdet, sp->det_as_log, Pt); } // printf("size=%d %d %d sparse=%d\n", size, sp->pivot, PIVOT_AUTO, sp->sparse); // ERR0("XXXX"); // printf("%d %d %d\n", PIVOT_AUTO, direct_formula,Pt->method); assert(SOLVE_METHODS >= 2); // printf("A\n"); // printf("A1\n"); int err = NOERROR, spam_zaehler = 0, nnzA = 0; Long sizeSq = (Long) size * size, sizeRHS = (Long) size * rhs_cols; int sizeP1 = size + 1; // printf("A1dd\n"); usr_bool sparse = sp->sparse; // printf("A1xx\n"); // printf("%e\n", sp->spam_tol); double spam_tol = sp->spam_tol; // printf("A2\n"); bool diag = false, useGPU = la_mode == LA_GPU && (calculate == SOLVE || calculate == DETERMINANT); // printf("A3\n"); // printf("A %d %d size=%d %d %d \n", sparse,Nan ,size, useGPU, sp->n_spam_min[useGPU]); if (sparse == Nan && (sparse = (usr_bool) (size > sp->spam_min_n[useGPU]))) { // printf("AB2\n"); double mean_diag = 0.0; for (Long i=0; i= sp->spam_sample_n * 3; if (random_sample) { // printf("A2E\n"); double thr = sp->spam_sample_n * (1.0 - sp->spam_min_p[useGPU]); Long threshold = (Long) (thr + SQRT(thr) * 3), notZero = 0; for (Long i=0; ispam_sample_n; i++) { // printf("A2 %d %d\n", i , sp->spam_sample_n); if ((notZero += !(FABS(M0[(i * sp->spam_factor) % sizeSq]) <= spam_tol)) >= threshold){ sparse = False; break; } } if (PL >= PL_FCTN_DETAILS) { PRINTF("random sampling: sparse=%d\n", sparse == Nan ? 
NA_INTEGER : (int) sparse); } } /// printf("A2EX %d %d\n", random_sample, sparse == True); if (!random_sample || sparse == True) { // printf("AdC2\n"); Long diag_nnzA = 0; //#ifdef DO_PARALLEL //#pragma omp parallel for num_threads(CORES) schedule(dynamic,10) reduction(+:nnzA,diag_nnzA) //#endif for (Long i=0; i spam_tol // bei NA/NaN for (j=i * size; jspam_min_p[useGPU])); spam_zaehler = nnzA + 1; // printf("ddAdC2\n"); if (PL >= PL_DETAILSUSER) { if (diag) { PRINTF("diagonal matrix detected\n"); } else if (sparse == True) { PRINTF("sparse matrix detected (%3.2f%% zeros)\n", 100.0 * (1.0 - nnzA / (double) sizeSq)); } else { PRINTF("full matrix detected (%3.2f%% nonzeros)\n", 100.0 * nnzA / (double) sizeSq); } } } } else { // printf("ttAdC2\n"); diag = true; for (Long i=0; inewMethods; pt->method = NoFurtherInversionMethod; pt->size = size; // printf("BA\n"); if (diag) { pt->method = Diagonal; if (PL>=PL_STRUCTURE) { PRINTF("dealing with diagonal matrix\n"); } if (logdet != NULL) { *logdet = Determinant(M0, size, sp->det_as_log); if (calculate == DETERMINANT) { err = NOERROR; goto ErrorHandling; } } if (rhs_cols == 0) { MEMCOPY(RESULT, M0, sizeSq * sizeof(double)); if (calculate == MATRIXSQRT) { for (Long i=0; i 0.0 ? SQRT(M0[i]) : 0.0; } } else { for (Long i=0; iMethods[from] != NoFurtherInversionMethod && sp->Methods[from] != NoInversionMethod) { if (sp->Methods[from] == Sparse && sparse == True) from++; else Meth[to++] = sp->Methods[from++]; } // printf("from = %d (%d %d) [%d %d %d] sparse=%d %d\n", from, sp->Methods[0], sp->Methods[1], Meth[0], Meth[1], Meth[2], sparse == True, Sparse); if (from == 0) { // user did not give any method if (posdef) { if (to < SOLVE_METHODS) { Meth[to++] = useGPU ? GPUcholesky : Cholesky; if (to < SOLVE_METHODS) { Meth[to++] = sp->pivot_mode != PIVOT_NONE && useGPU ? Cholesky : Eigen; } } } else { Meth[to++] = LU; } } else { if (useGPU) for (int i=0; i 0 || (SOLVE_METHODS > first_not_reading_M0 + 1 && Meth[first_not_reading_M0 + 1] != Meth[first_not_reading_M0] && Meth[first_not_reading_M0 + 1] != NoFurtherInversionMethod) || (Meth[first_not_reading_M0] == SVD && sp->svd_tol > 0.0 && calculate != SOLVE) ) { // at least two different Methods in the list CMALLOC(main, sizeSq, double); // to pt->main, main local variable MPT = pt->main; if (rhs_cols > 0) { CMALLOC(rhs, sizeRHS, double); // to pt->main, main local variable RHS = pt->rhs; } } } } // printf("AFF\n"); errorstring_type ErrStr; STRCPY(ErrStr, ""); // printf("Meth=%d %d Chol=%d %d posdef=%d\n", Meth[0], Meth[1], Cholesky, SOLVE_METHODS, posdef); // for (int i=0; ipivot_mode; for (int m=0; mmethod = Meth[m]; // extern bool obsolete_package_in_use; //if (ob solete_package_in_use) // printf("m=%d %d %s: %d size=%d %d la_mode=%d\n", m, pt->method, InversionNames[pt->method], Cholesky ,size , OPTIONS.basic.LaMaxTakeIntern, la_mode); if (pt->method == Cholesky && size > OPTIONS.installNrun.LaMaxTakeIntern) { pt->method = calculate == DETERMINANT ? 
LU : Rcholesky; } if (pt->method < 0) break; if (calculate != SOLVE) { if (pt->method == NoInversionMethod && m<=sparse) BUG; if (pt->method == NoFurtherInversionMethod) break; if (PL>=PL_STRUCTURE) { PRINTF("method to calculate the square root : %s\n", InversionNames[pt->method]); } } else { if (PL>=PL_STRUCTURE) { PRINTF("method to calculate the inverse : %s\n", InversionNames[pt->method]); } } if (MPT != M0 && m >= first_not_reading_M0) MEMCOPY(MPT, M0, sizeSq * sizeof(double)); if (RHS != rhs0) MEMCOPY(RHS, rhs0, (Long) sizeRHS * sizeof(double)); switch(pt->method) { case GPUcholesky : if (size > 28000) GERR0("due to a bug at NVIDIA the maximum size is currently limited to 28000."); if (!posdef) CERR0("Cholesky needs positive definite matrix"); #ifdef USEGPU if (proposed_pivot > PIVOT_AUTO) GERR0("cholesky decomposition on GPU does not allow for pivoting"); pt->actual_pivot = PIVOT_NONE; { double LD, *LogDet = logdet == NULL ? &LD : logdet; err = // int; see errors_messages.h for values 0...4 cholGPU(true, // bool : in : says that values must be copied // so in miraculix, Ext_AlexChol(false, ....) // can be called M0,// in: this matrix is copied by Alex because // of value 'true' in the first argument, // so contents never distroyed by Alex size, // in: size of the matrix rhs0, //in: if NULL the inverse of M is calculated; // rhs is copied by Alex because of value 'true' // in the first argument,, so never distroyed by Alex rhs_cols, // in: number of columns on the right hand side LogDet, // out : logarithm of the determinant of // the sqare root(!) of the matrix M RESULT); // out: a pointer to the result whether or // not rhs is given if (err != NOERROR) { if (proposed_pivot == PIVOT_AUTO) proposed_pivot = PIVOT_DO; continue; } if (logdet != NULL) { *logdet *= 2; if (!sp->det_as_log) *logdet = EXP(*logdet); if (calculate == DETERMINANT) { err = NOERROR; goto ErrorHandling; } } } #else // err = NOERROR; BUG; #endif break; case Rcholesky : { // printf("chol (R)\n"); if (calculate == SOLVE) { if (rhs_cols > MAXINT) BUG; int n_rhs = (int) rhs_cols; double *m = NULL; CMALLOC(xja, size, int); if (rhs_cols == 0) { // printf("hier %ld %ld %ld res=%ld %ld %ld\n", RESULT, MPT, M0, result, rhs0, RHS); if (RESULT == MPT){ Long bytes = sizeSq * sizeof(double); m = (double *) MALLOC(bytes); MEMCOPY(m, M0, bytes); } MEMSET(RESULT, 0, sizeof(double) * sizeSq); for (Long i=0; i MAXINT) BUG; n_rhs = size; /* for (Long i=0; idet_as_log, xja); FREE(m); // NOTE: return / errrors only afterwards! if (err != NOERROR) GERR0("LU algorithm failed."); } else if (calculate == MATRIXSQRT) { if (MPT != RESULT) MEMCOPY(RESULT, MPT, sizeSq * sizeof(double)); F77dpotrf("U", &size, RESULT, &size, &err #ifdef USE_FC_LEN_T FCONE #endif ); if (logdet != NULL) { Determinant(RESULT, size, sp->det_as_log); if (sp->det_as_log) *logdet *=2; else *logdet *= *logdet; } Long sizeM1 = size - 1; for (Long i=0; i sp->max_chol) { CERR2("Matrix is too large for Cholesky decomposition. Maximum ist currently a %d x %d matrix. Increase 'max_chol' in 'RFoption' if necessary.", sp->max_chol, sp->max_chol); } /// printf("size = %d %d %d\n", size, rhs_cols, size > rhs_cols ? size : rhs_cols); CMALLOC(D, size > rhs_cols ? 
size : rhs_cols, double); for (Long i=0; iactual_pivot = PIVOT_UNDEFINED; if (proposed_pivot == PIVOT_NONE || proposed_pivot == PIVOT_AUTO) { // cmp for instance http://stackoverflow.com/questions/22479258/cholesky-decomposition-with-openmp // obere und untere dreiecksmatrix wird gelesen und obere geschrieben err = NOERROR; pt->actual_pivot = PIVOT_NONE; { double *A = MPT; for (Long i=0; idet_as_log); if (sp->det_as_log) *logdet *=2; else *logdet *= *logdet; if (calculate == DETERMINANT) { err = NOERROR; goto ErrorHandling; } } if (rhs_cols == 0) chol2inv(MPT, size); else { // rhs_cols > 0 //Long totalRHS = size * rhs_cols; //if (result!=NULL) MEMCOPY(RESULT, rhs, sizeof(double)*totalRHS); #ifdef DO_PARALLEL #pragma omp parallel for num_threads(CORES) if (rhs_cols > 30) schedule(static) #endif for (Long k=0; k 30) schedule(static) #endif for (Long k=0; k=0; i--) { double *pM = MPT + i * size, r = (p_RESULT[i] /= pM[i]); LINEAR(pM, -r, i, p_RESULT); // for (Long j=0; j PL_DETAILS) { PRINTF("trying pivoting\n"); } int actual_size = NA_INTEGER; // code according to Helmut Harbrecht,Michael Peters,Reinhold Schneider // talk: The pivoted Cholesky decomposition and its application to // stochastic PDEs // ebenso: untere dreiecksmatrix wird gelesen; obere geschrieben if (pt->actual_pivot == PIVOT_NONE) { // wiederherstellung der Diagonalen und der unteren dreiecksmatrix for (Long i=0; ipivot_idx); // ALWAYS FREE IT!!! cp Chol(SEXP M) pt->pivot_idx = (int*) MALLOC((Long) size * sizeof(int)); pt->n_pivot_idx = size; pt->actual_pivot = PIVOT_DO; for (int i=0; ipivot_idx[i] = i; pt->actual_size = actual_size = size; pi = pt->pivot_idx; } else { // PIVOT_IDX if (sp->n_pivot_idx < size || sp->actual_size > size) { // printf("XA, %d %d %d\n", sp->n_pivot_idx , size, sp->actual_size); CERR0("pivot_idx does not have the correct length.\nSet 'RFoption(pivot_idx=, pivot_actual_size=)' to the attributes of a\npivoted Cholesky decomposition."); } actual_size = pt->actual_size = sp->actual_size; if (actual_size > size) BUG; FREE(pt->pivot_idx); Long bytes = (Long) size * sizeof(int); pt->pivot_idx = (int*) MALLOC(bytes); MEMCOPY(pt->pivot_idx, sp->pivot_idx, bytes); pt->actual_pivot = PIVOT_IDX; pt->n_pivot_idx = sp->n_pivot_idx; pi = sp->pivot_idx; } err = NOERROR; // printf("hier\n"); double // *rhs = rhs, // *M00 = M0, rel_thres = 0, max_deviation = sp->max_deviation, // 1e-10, max_reldeviation = sp->max_reldeviation; // 1e-10, //printf("MTP %d %d %d\n", MPT == M0, rhs_cols, RHS == RESULT); if (MPT == M0 || (rhs_cols > 0 && RHS == RESULT)) CERR0("Pivoted cholesky cannot be performed on place! 
Either you are a programmer or you should contact the maintainer."); /* if (MPT == M0) { CMALLOC(main, sizeSq, double); MEMCOPY(main, M0, sizeSq * sizeof(double)); M00 = main; } if (rhs_cols > 0 && rhs == RESULT) { Long totalRHS = (Long) size * rhs_cols; CMALLOC(U, totalRHS, double); MEMCOPY(U, rhs, totalRHS * sizeof(double)); RHS = U; }*/ for (Long q=0; qactual_pivot == PIVOT_DO) { double max = RF_NEGINF, deviation = 0.0; Long k, argmax = NA_INTEGER; for (k=q; kpivot_relerror* sizeSq){ C_GERR1("matrix not positive definite or increase 'pivot_relerror' by at least factor %10g.", dummy * -1e4 / sizeSq, ERR_CHOL); } deviation += dummy; if (max < dummy) { max = dummy; argmax = k; } } double dev = rel_thres * max_reldeviation; if (deviation <= max_deviation || (q > 0 && deviation <= dev) ) { if (q > MAXINT) BUG; actual_size = pt->actual_size = (int) q; if (sp->pivot_check != False) { double largest = 0; for (Long i=q; i largest ? absm : largest; // if(absm == largest || absm > 5) printf("%10e %d %d; %d\n", absm, i, j, size); } } if (largest > max_deviation || (q > 0 && largest > dev)) { char msg[500]; SPRINTF(msg, "Matrix has a numerically zero, leading value at the %d-th pivoted line, but the largest deviation %10e from zero in the rest of the matrix is greater than the tolerance %10e. %.50s.", (int) q, largest, MAX(max_deviation, dev), sp->pivot_check == True ? "If you are sure that the matrix is semi-definite, set 'RFoptions(pivot_check=NA)' or 'RFoptions(pivot_check=True)'" : "The result can be imprecise"); if (sp->pivot_check == True) C_GERR0(msg, ERR_CHOL) else WARN0(msg); } } break; } rel_thres += D[pi[q]]; int dummy = pi[q]; pi[q] = pi[argmax]; pi[argmax] = dummy; } Long pqq = pi[q], col_q = pqq * size; if (D[pqq] < 0) { C_GERR1("Negative leading value found at the %d-th pivoted line.", (int) q, ERR_CHOL); } double lqpq = MPT[q + col_q] = SQRT(D[pqq]); #ifdef DO_PARALLEL #pragma omp parallel for num_threads(CORES) if (MULTIMINSIZE(size - q)) schedule(dynamic, 8) #endif for (Long i=q+1; iactual_pivot == PIVOT_DO) { N = actual_size; } if (sp->det_as_log) { if (N < size && !sp->pivot_partialdet) *logdet = RF_NEGINF; else { double logD = 0.0; for (Long i=0; i < N; i++) logD += LOG(MPT[i + pi[i] * (Long) size]); *logdet = logD * 2; } } else { if (N < size && !sp->pivot_partialdet) *logdet = 0; else { double logD = 1.0; for (Long i=0; i < N; i++) logD *= MPT[i + pi[i] * (Long) size]; *logdet = logD * logD; } } if (calculate == DETERMINANT) { err = NOERROR; goto ErrorHandling; } } ////////////////////////////////////////////////// ////////////////////////////////////////////////// if (rhs_cols == 0) { if (actual_size < size) GERR0("Matrix not definite. 
Try ") #ifdef DO_PARALLEL #pragma omp parallel for num_threads(CORES) if (size > 60) schedule(dynamic, 20) #endif for (Long k=0 ; k 60) schedule(dynamic, 20) #endif for (Long k=0; kk; i--) { double *pM = MPT + pi[i] * (Long) size, r = (p_RESULT[i] /= pM[i]); D[k] -= r * pM[k]; LINEAR(pM + k + 1, -r, i-k-1, p_RESULT + k + 1); // for (Long j=k+1; j 0 // if (rhs0 == RESULT) { // /* crash(); */ // #pragma GCC diagnostic push //#pragma GCC diagnostic ignored "-Wuninitialized" // Long i; PRINTF("%d\n", i);char m[1];m[i] = m[i-9] + 4; if (m[0]) i++; else i--; PRINTF("%s\n", m); // not MEMCOPY //#pragma GCC diagnostic pop // int *x = (int*) MALLOC(1000000); f ree(x); f ree(x); x[100] = 100; // } // printf("%ld %ld %ld\n", rhs0 , RESULT, RHS); //assert(rhs0 != RESULT); // assert(RHS != RESULT); double eps = D[0] * sp->pivot_relerror; #ifdef DO_PARALLEL #pragma omp parallel for num_threads(CORES) if (rhs_cols > 30) schedule(static) #endif for (Long k=0; k eps) { if (Pt == NULL) solve_DELETE(&pt); #ifdef DO_PARALLEL RFERROR("Equation system not solvable"); // RFERROR necessary. #else GERR1("Equation system not solvable (difference %10e). Try increasing 'pivot_relerror' in 'RFoptions' to get an approximate solution.", p_rhs[pii] - SCALAR(pM, p_RESULT, i)); #endif } } } #ifdef DO_PARALLEL #pragma omp parallel for num_threads(CORES) schedule(static) if (rhs_cols > 30) #endif for (Long k=0; k=0; i--) { Long pii = pi[i]; double *pM = MPT + pii * size, r = (p_RESULT[i] /= pM[i]); LINEAR(pM, -r, i, p_RESULT); // for (Long j=0; j 0 } // not sqrt only } // err == NOERROR } ERR_CHOL: if (err != NOERROR) { if (pt->actual_pivot == PIVOT_NONE) CERR2("Probably matrix not positive definite: %.300s. Consider 'RFoptions(%.80s)'.\n", ErrStr, calculate != SOLVE || rhs_cols > 0 ? "'pivot=PIVOT_AUTO)' or 'RFoptions(pivot=PIVOT_DO" : "solve_method=\"eigen\", solve.pseudoinverse=TRUE") // OK else // pt->actual_pivot == PIVOT_DO or PIVOT_IDX CERR1("Likely, the matrix is not positive semi definite: %.300s. Consider 'RFoptions(solve_method=\"svn\"\n", ErrStr) } if (PL >= PL_DETAILSUSER) { PRINTF("Cholesky decomposition successful\n"); } } break; case QR : {// QR returns transposed of the inverse !! if (rhs_cols > 0 || logdet != NULL || calculate != SOLVE) { err = ERRORFAILED; continue; } err = ERRORNOTPROGRAMMEDYET; /// to do: clarify transposed ! continue; CMALLOC(w2, size, double); CMALLOC(w3, size, double); F77dgeqrf(&size, &size, // QR MPT, &size, // aijmax, &irank, inc, w2, w3, w2, &size, &err); assert(false); // code zur invertierung fehlt hier! if (err != NOERROR) { CERR1("'dgeqrf' failed with err=%d.", err); } if (PL >= PL_DETAILSUSER) { PRINTF("QR successful\n"); } break; } case Eigen : { // M = U D UT int max_eigen = sp->max_svd; double eigen2zero = sp->eigen2zero; if (size > max_eigen) CERR0("matrix too large for Cholesky or eigen value decomposition. 
Increase 'max_chol' and 'max_svd' in 'RFoption' if necessary."); double optimal_work, *pt_work = &optimal_work; int k=0, optimal_intwork, *pt_iwork = &optimal_intwork, lw2 = -1, lintwork = -1; CMALLOC(U, sizeSq, double); CMALLOC(D, size, double); CMALLOC(xja, 2 * (Long) size, int); CMALLOC(w3, size, double); for (int i=0; i<=1; i++) { double dummy = 0.0, abstol = 0.0; int dummy_nr; // printf("%f %f %f %f\n", MPT[0], MPT[1], MPT[2], MPT[3]); F77dsyevr("V", "A", "U", &size, // Eigen MPT, &size, &dummy, &dummy, &k, &k, &abstol,// or DLAMCH &dummy_nr, D, U, &size, xja, // 2 * size * sizeof(integer); nonzeros_idx pt_work, &lw2, pt_iwork, &lintwork, &err #ifdef USE_FC_LEN_T FCONE FCONE FCONE #endif ); // printf("eigen %d %f %f; %e %e %e %e\n", err, D[0], D[1], U[0], U[1], U[2], U[3]); if (i==1 || err != NOERROR || ISNAN(D[0])) break; lw2 = (int) optimal_work; lintwork = (int) optimal_intwork; CMALLOC(w2, lw2, double); CMALLOC(iwork, lintwork, int); pt_iwork = iwork; pt_work = w2; } if (err != NOERROR) { if (PL>PL_ERRORS) { PRINTF("F77 error code for 'dsyevr' = %d\n", err);} CERR1("'dsyevr' failed with err=%d.", err); break; } for (Long i=0; i -eigen2zero * 100]); } if (calculate == MATRIXSQRT) { for (Long j=0; j= eigen2zero) dummy = SQRT(D[j]); for (Long i=0; idet_as_log); if (calculate == DETERMINANT) { err = NOERROR; goto ErrorHandling; } } // printf("Hiere EIGGEN\n"); bool pseudoInverse = false; for (Long j=0; j 0) { Long tot = (Long) size * rhs_cols; CMALLOC(w2, tot, double); matmulttransposed(U, RHS, w2, size, size, rhs_cols, cores); if (pseudoInverse) { for (k=0; k eigen2zero) { GERR0("singular matrix problem does not have a solution"); } k++; } } } else { for (k=0; kpseudoinverse) GERR0("Singular matrix: inverse does not exist. Consider 'RFoption(solve.pseudoinverse=TRUE)'"); CMALLOC(w2, sizeSq, double); for (Long k=0, j=0; j= PL_DETAILSUSER) { PRINTF("eigen value decomposition successful\n"); } break; } case SVD : {// SVD : M = U D VT if (size > sp->max_svd) CERR0("matrix too large for SVD decomposition."); int lw2 = -1, size8 = size * 8; double optim_lwork, eigen2zero = sp->eigen2zero, *pt_w2 = &optim_lwork; CMALLOC(w3, sizeSq, double); CMALLOC(U, sizeSq, double); CMALLOC(D, size, double); CMALLOC(iwork, size8, int); CMALLOC(lnz, size, double); for (Long i=0; i<=1; i++) { F77dgesdd("A", &size, &size, // SVD MPT, &size, D, U, &size, w3, &size, pt_w2, &lw2, iwork, &err #ifdef USE_FC_LEN_T FCONE #endif ); if (i==1 || err != NOERROR || ISNAN(D[0])) break; lw2 = (int) optim_lwork; CMALLOC(w2, lw2, double); pt_w2 = w2; } if (err != NOERROR) { if (PL>PL_ERRORS) { PRINTF("F77 error code for 'dgesdd' = %d\n", err);} CERR1("'dgesdd' failed with err=%d.", err); break; } if (calculate == MATRIXSQRT) { double svdtol = sp->svd_tol; /* calculate SQRT of covariance matrix */ for (Long j=0, k=0; j= eigen2zero) dummy = SQRT(D[j]); for (Long i=0; i 0.0) { for (Long i=0; i svdtol) { if (PL > PL_ERRORS) { PRINTF("difference %10e at (%d,%d) between the value (%10e) of the covariance matrix and the square of its root (%10e).\n", M0[i * size +k] - sum, i, k, M0[i*size+k], sum); } FERR3("required precision not attained (%10e > %10e): probably invalid model. 
See also '%.50s'.", FABS(M0[i * size + k] - sum), svdtol, solve[SOLVE_SVD_TOL]); err=ERRORM; break; } //else printf("ok (%d,%d) %10g %10g\n", i, k, M0[i*size+k],sum); } if (err != NOERROR) break; } if (err != NOERROR) break; } // end if svdtol > 0 } else { // calculate determinant if (logdet != NULL) { *logdet = cumProd(D, size, sp->det_as_log); if (calculate == DETERMINANT) { err = NOERROR; goto ErrorHandling; } } bool pseudoInverse = false; for (Long j=0; j 0) { Long tot = (Long) size * rhs_cols; CMALLOC(w2, tot, double); matmulttransposed(U, RHS, w2, size, size, rhs_cols,cores); if (pseudoInverse) { for (Long k=0; k eigen2zero) { GERR0("singular matrix problem does not have a solution."); } k++; } } } else { for (Long k=0; kpseudoinverse) GERR0("Singular matrix: inverse does not exist. Consider 'RFoption(solve.pseudoinverse=TRUE)'"); for (Long k=0, j=0; j= PL_DETAILSUSER) { PRINTF("svd successful\n"); } break; } case LU : {// LU //printf("LU\n"); if (calculate == MATRIXSQRT) { err = ERRORFAILED; continue; } CMALLOC(xja, size, int); F77dgetrf(&size, &size, MPT, &size, xja, &err); if (err != NOERROR) { CERR1("'dgetrf' (LU) failed with err=%d.", err); } //printf("LU %d\n", logdet != NULL); if (logdet != NULL) { *logdet = DeterminantLU(MPT, size, sp->det_as_log, xja); if (calculate == DETERMINANT) { err = NOERROR; goto ErrorHandling; } } if (rhs_cols > 0) { if (rhs_cols > MAXINT) BUG; int rhs_cols0 = (int) rhs_cols; Long totalRHS = (Long) size * rhs_cols; if (result != NULL) MEMCOPY(RESULT, RHS, sizeof(double) * totalRHS); F77dgetrs("N", &size, // LU rhs &rhs_cols0, MPT, &size, xja, RESULT, &size, &err #ifdef USE_FC_LEN_T FCONE #endif ); if (err != NOERROR) { CERR1("'dgetrs' (LU) failed with err=%d.", err); } } else { int lw2 = -1; double dummy, *p = &dummy; for (int i=0; i<=1; i++) { F77dgetri(&size, MPT, // LU solve &size, xja, p, &lw2, &err); if (err != NOERROR) break; lw2 = (int) dummy; CMALLOC(w2, lw2, double); p = w2; } } if (PL >= PL_DETAILSUSER) { PRINTF("LU decomposition successful\n"); } break; } case Sparse : {// sparse matrix if (sizeSq > MAXINT) BUG; Long halfsq = (Long) size * (size + 1) / 2; int nnzlindx = -1, doperm = sp->pivotsparse, nnzcolindices = 0, nnzR = 0, cache = 512, // to do: CPU cache size nnzcfact[3] = { 5, 1, 5 }, nnzRfact[3] = { 5, 1, 2 }; double cholincrease_nnzcol = 1.25, cholincrease_nnzR = 1.25; if (!posdef) CERR0("'spam' needs a positive definite matrix."); CMALLOC(pivotsparse, size, int); if (!doperm) for (int i=0; insuper = 0; // calculate spam_cholesky err = 4; // to get into the while loop while (err == 4 || err == 5) { if (nnzcolindices == 0) { double rel = nnzA / (double) size; if (rel < 5) { nnzcolindices = (int) CEIL(nnzA * (1.05 * rel - 3.8)); if (nnzcolindices < 1000) nnzcolindices = 1000; } else { nnzcolindices = nnzA; } nnzcolindices *= nnzcfact[doperm]; if (nnzcolindices < nnzA) nnzcolindices = nnzA; } else if (err == 5) { int tmp = (int) CEIL(nnzcolindices * cholincrease_nnzcol); if (PL > PL_RECURSIVE) { PRINTF("Increased 'nnzcolindices' with 'NgPeyton' method\n(currently set to %d from %d)", tmp, nnzR); } nnzcolindices = tmp; } if (nnzcolindices < pt->n_lindx) nnzcolindices = pt->n_lindx; if (nnzR == 0) { double u = FLOOR(.4 * POW(nnzA, 1.2)); u = u < 4 * nnzA ? 
4 * nnzA : CEIL(u); nnzR = (int) u * nnzRfact[doperm]; } else if (err == 4) { int tmp = (int) CEIL(nnzR * cholincrease_nnzR); if (PL > PL_RECURSIVE) { PRINTF("Increased 'nnzR' with 'NgPeyton' method\n(currently set to %d from %d)", tmp, nnzR); } nnzR = tmp; } if (nnzR < pt->n_lnz) nnzR = pt->n_lnz; else if (nnzR > halfsq) nnzR = (int) halfsq; CMALLOC(lindx, nnzcolindices, int); CMALLOC(lnz, nnzR, double); F77cholstepwise(&size, &nnzA, D, cols, rows, &doperm, invp, pivotsparse, &nnzlindx, &nnzcolindices, lindx, // iwork,// &(pt->nsuper), // length of lindx &nnzR, // physical length of lindx lnz, // output:result xlnz, // cols of lnz "ja" snode, // supernode membership ?? xsuper, // supernode partioning &cache, // cache size of the CPU &err ); if (err != NOERROR) { CERR1("'cholstepwise' failed with err=%d.", err); break; } } // while if (err != NOERROR) CERR0("'spam' failed."); if (PL >= PL_DETAILSUSER) { PRINTF("'spam' successful\n"); } // spam solve if (calculate == MATRIXSQRT) { //BUG; // unexpected behaviour in spam nnzR = xlnz[size] - 1; CMALLOC(xja, nnzR, int); F77calcja(&size, &(pt->nsuper), pt->xsuper, pt->lindx, pt->iwork, pt->xlnz, xja); for (Long i=0; ilnz, xja, pt->xlnz, RESULT); for (Long i=0; ilnz; int *lindx = pt->lindx; // spam determinant if (logdet != NULL) { if (sp->det_as_log) { double tmp = 0.0; for (Long i=0; i MAXINT) BUG; RHS_COLS = (int) rhs_cols; if (result != NULL) MEMCOPY(RESULT, RHS, (Long) size * rhs_cols * sizeof(double)); } //printf("nsuper=%d\n", pt->nsuper); // for (Long ii=0; iinsuper, sizeP1, xsuper[ii], // w3[ii]); // if (false) // for (Long jsub=0; jsub<=pt->nsuper; jsub++) { // int fj = xsuper[1 - 1], // Lj = xsuper[jsub + 1 - 1] -1; // printf("%d %d %d\n", jsub, fj, Lj); // for (Long jcol=fj; jcol <= Lj; jcol++) { // printf("%d,%10g ", jcol, w3[jcol - 1]); // } // } // for (Long jcol=1; jcol <= 600; jcol++) { // w3[jcol - 1] = jcol; // printf("%d,%10g ", jcol, w3[jcol - 1]); // } // printf("%ld %ld %d\n", RESULT, rhs, rhs_cols); // for (Long ii=0; iinsuper), &RHS_COLS, lindx, // colindices iwork, //colpointers lnz, xlnz, // rowpointers invp, pivotsparse, xsuper, // supernodes w3, RESULT); if (PL >= PL_DETAILSUSER) { PRINTF("'spam' successful\n"); } } break; } // Sparse case NoInversionMethod: GERR0("no inversion method given."); case NoFurtherInversionMethod: STRCPY(ErrStr, WHICH_ERRORSTRING); GERR1("%.300s (All specified matrix inversion methods have failed.)", ErrStr); case direct_formula: case Diagonal: GERR1("strange method appeared:%.200s.", CONTACT); default : GERR1("unknown method (%d) in 'RandomFieldsUtils'.", pt->method); } // switch if (err==NOERROR) break; } // for m ErrorHandling: if (Pt == NULL) solve_DELETE(&pt); else Pt->sparse = sparse; // if (err != NOERROR) { printf("Err = %s %s %d\n", ErrStr, WHICH_ERRORSTRING ,err); exit(0);} return err; // -method; } SEXP doPosDef(SEXP M, SEXP rhs, SEXP logdet, int calculate, solve_storage *Pt, solve_options *Sp, int VARIABLE_IS_NOT_USED cores){ // rhs_cols == 0 iff RHS = NULL int rhs_rows, rhs_cols, size = ncols(M); if (nrows(M) != size) ERR0("not a square matrix"); int err = NOERROR; bool deleteMM = false, deleteRHS = false; SEXP res; solve_storage Pt0, *pt = Pt; if (pt == NULL) { solve_NULL(&Pt0); pt = &Pt0; } if (rhs == R_NilValue) { rhs_rows = rhs_cols = 0; } else if (isMatrix(rhs)) { rhs_rows = nrows(rhs); rhs_cols = ncols(rhs); } else if ((rhs_rows = length(rhs)) == 0) { rhs_cols = 0; } else { rhs_cols = 1; } if (rhs_rows > 0 && rhs_rows != size) ERR0("vector size does not match the matrix 
size"); int new_cols = rhs_cols == 0 ? size : rhs_cols; Long total = (Long) size * new_cols; // res = PROTECT(isReal(M) ? duplicate(M): coerceVector(M, REALSXP)); UNPROTECT(1); return res; if (rhs_cols==0 || isMatrix(rhs)) { res = PROTECT(allocMatrix(REALSXP, size, new_cols)); } else { res = PROTECT(allocVector(REALSXP, total)); } double *MM=NULL, *RHS = NULL; if (TYPEOF(M) != REALSXP) { if (TYPEOF(M) != INTSXP && TYPEOF(M) != LGLSXP) GERR0("numerical matrix expected"); if ((deleteMM = rhs_cols != 0)) MM = (double*) MALLOC(total * sizeof(double)); else MM = REAL(res); if (TYPEOF(M) == INTSXP) { for (Long i=0; i 0) { if ((deleteRHS = TYPEOF(rhs) != REALSXP)) { if (TYPEOF(rhs) != INTSXP && TYPEOF(rhs) != LGLSXP) GERR0("numerical matrix expected"); Long totalRHS = (Long) rhs_cols * rhs_rows; RHS = (double*) MALLOC(totalRHS * sizeof(double)); if (TYPEOF(rhs) == INTSXP) { for (Long i=0; i 0 && TYPEOF(rhs) == REALSXP) ? REAL(res) : NULL, length(logdet) == 0 ? NULL : REAL(logdet), calculate, pt, Sp, cores); ErrorHandling: if (deleteMM) { FREE(MM); } if (deleteRHS) { FREE(RHS); } if (pt != Pt) solve_DELETE0(pt); UNPROTECT(1); if (err != NOERROR) { const char *methname[] = {"solvePosDef", "cholesky", "determinant"}; errorstring_type msg; switch (err) { case ERRORMEMORYALLOCATION : STRCPY(msg, "memory allocation error"); break; case ERRORNOTPROGRAMMEDYET : STRCPY(msg, "not programmed yet"); break; case ERRORFAILED : STRCPY(msg, "algorithm has failed"); break; case ERRORM : STRCPY(msg, pt->err_msg); break; default: STRCPY(msg, ""); } RFERROR2("'%.200s': %.200s.\n", methname[calculate], msg); } return res; } SEXP SolvePosDefR(SEXP M, SEXP rhs, SEXP logdet){ KEY_type *KT = KEYT(); int cores = KT->global_utils.basic.cores; // rhs_cols == 0 iff RHS = NULL return doPosDef(M, rhs, logdet, SOLVE, NULL, &(OPTIONS.solve), cores); } int SolvePosDef(double *M, int size, bool posdef, double *rhs, Long rhs_cols, double *logdet, solve_storage *PT, int VARIABLE_IS_NOT_USED cores) { if ((rhs == NULL) xor (rhs_cols == 0)) BUG; return doPosDefIntern(M, size, posdef, rhs, rhs_cols, NULL, // result, so result returned in M or rhs logdet, SOLVE, // calculate PT, // storage &(OPTIONS.solve), cores); } int SolvePosDefSp(double *M, int size, bool posdef, double *rhs, Long rhs_cols, double *logdet, solve_storage *PT, solve_options * sp, int VARIABLE_IS_NOT_USED cores) { if ((rhs == NULL) xor (rhs_cols == 0)) BUG; return doPosDefIntern(M, size, posdef, rhs, rhs_cols, NULL, logdet, SOLVE, PT, sp, cores); } int xCinvYdet(double *M, int size, bool posdef, double *X, double *Y, Long cols, double *XCinvY, double *det, bool log, solve_storage *PT, int VARIABLE_IS_NOT_USED cores) { // called by randomfields int NR = KAHAN ? 
SCALAR_KAHAN : SCALAR_AVX; bool pt = PT != NULL && PT->result != NULL; double *result; if (pt) result=PT->result; else result= (double *) MALLOC(sizeof(double) * (Long) size * cols); if (result == NULL) return ERRORMEMORYALLOCATION; double *res = result; solve_options sp; MEMCOPY(&sp, &(OPTIONS.solve), sizeof(solve_options)); sp.det_as_log = log; int err = doPosDefIntern(M,// no PROTECT( needed size, posdef, Y, cols, result, det, SOLVE, PT, &sp, cores); for (Long i=0; iglobal_utils.basic.cores; solve_options sp; MEMCOPY(&sp, &(OPTIONS.solve), sizeof(solve_options)); sp.Methods[0] = sp.Methods[1] = Cholesky; sp.sparse = False; // currently does not work, waiting for Reinhard solve_storage Pt; solve_NULL(&Pt); SEXP Ans; PROTECT(Ans = doPosDef(M, R_NilValue, R_NilValue, MATRIXSQRT, &Pt, &sp, cores)); if (Pt.actual_pivot == PIVOT_DO || Pt.actual_pivot == PIVOT_IDX) { // NEVER: FREE(OPTIONS.solve.pivot_idx); See Pivot_Cholesky: SEXP Idx, Info1, Info3; PROTECT(Idx = allocVector(INTSXP, Pt.n_pivot_idx)); MEMCOPY(INTEGER(Idx), Pt.pivot_idx, sizeof(int) * Pt.n_pivot_idx); setAttrib(Ans, install("pivot_idx"), Idx); PROTECT(Info1 = allocVector(INTSXP, 1)); INTEGER(Info1)[0] = Pt.actual_size; setAttrib(Ans, install("pivot_actual_size"), Info1); PROTECT(Info3 = allocVector(INTSXP, 1)); INTEGER(Info3)[0] = PIVOT_DO; setAttrib(Ans, install("actual_pivot"), Info3); UNPROTECT(3); assert(Pt.n_pivot_idx == ncols(M)); } solve_DELETE0(&Pt); UNPROTECT(1); return Ans; } int chol(double *M, int size, solve_options *sp, int VARIABLE_IS_NOT_USED cores) { return doPosDefIntern(M, size, true, NULL, 0, NULL, NULL, MATRIXSQRT, NULL, sp, cores); } int cholesky(double *M, int size, int VARIABLE_IS_NOT_USED cores) { solve_options sp; MEMCOPY(&sp, &(OPTIONS.solve), sizeof(solve_options)); sp.Methods[0] = sp.Methods[1] = Cholesky; sp.sparse = False; // currently does not work, waiting for Reinhard return chol(M, size, &sp, cores); } bool Is_positive_definite(double *C, int dim, int VARIABLE_IS_NOT_USED cores) { // bool not allowed in C int err; Long bytes = sizeof(double) * dim * dim; double *test; test = (double*) MALLOC(bytes); MEMCOPY(test, C, bytes); err = cholesky(test, dim, cores); // printf("errr = %d\n", err); UNCONDFREE(test); return(err == NOERROR); } /* ## extrem wichter check -- folgendes funktioniert bislang bei spam nicht: library(RandomFields, lib="~/TMP") RFoptions(printlevel = 3, pch="", seed=999, use_spam = TRUE) #// z = RFsimulate(RMspheric(), x, max_variab=10000, n=10000, spC=F ALSE) C = cov(t(z)) c = RFcovmatrix(RMspheric(), x) #// p rint(summary(as.double(c - C))) ##// stopifnot(max(a b s(c-C)) < 0.05) */ int SqrtPosDefFree(double *M, // in out int size, solve_storage *pt, // in out solve_options *sp, int VARIABLE_IS_NOT_USED cores ){ int err; Long sizeSq = (Long) size * size; if (sp == NULL) sp = &(OPTIONS.solve); InversionMethod *Meth = sp->Methods; double *res = NULL; bool extra_alloc = Meth[0] == NoInversionMethod || Meth[0] == NoFurtherInversionMethod || (Meth[1] != NoInversionMethod && Meth[1] != NoFurtherInversionMethod && Meth[1] != Meth[0]) || (Meth[0] != Cholesky && Meth[0] != Eigen && Meth[0] != SVD); assert(pt != NULL); if (sp->sparse == True) warning("package 'spam' is currently not used for simulation"); sp->sparse = False; if (extra_alloc) { CMALLOC(result, sizeSq, double); res = result; } else { FREE(pt->result); pt->result = M; pt->n_result = sizeSq; } // it is ok to have // ==15170== Syscall param sched_setaffinity(mask) points to unaddressable byte(s) // caused by gcc stuff err = 
doPosDefIntern(M, size, true, NULL, 0, res, NULL, MATRIXSQRT, pt, sp, cores);// no PROTECT( needed if (extra_alloc) { #if defined MSDOS_WINDOWS pt->to_be_deleted = M; #else FREE(M); #endif } return err; } void sqrtRHS_Chol(double *U, int size, double* RHS, Long RHS_size, Long n, double *result, bool pivot, int act_size, int *pi) { // printf("n=%d,rhss=%d si=%d pivot=%d, act=%d U=%d RHS=%d %d pi=%d\n", // n, RHS_size, size,pivot, act_size, U!=NULL, RHS!=NULL, result!=NULL, pi!=NULL ); // for (Long i=0; i 0; int n = isMatrix(RHS) ? ncols(RHS) : 1, rows = isMatrix(RHS) ? nrows(RHS) : length(RHS), size = ncols(C), act_size =size; if (pivot) { SEXP dummy; PROTECT(dummy = getAttrib(C, install("pivot_actual_size"))); act_size=INTEGER(dummy)[0]; n_protect++; } int *pi = pivot ? (int *) INTEGER(Idx) : NULL; if (isMatrix(RHS)) PROTECT(Ans = allocMatrix(REALSXP, size, n)); else PROTECT(Ans = allocVector(REALSXP, size)); if (rows < act_size) ERR0("too few rows of RHS"); sqrtRHS_Chol(REAL(C), size, REAL(RHS), rows, n, REAL(Ans), pivot, act_size, pi); UNPROTECT(n_protect); return Ans; } SEXP chol2mv(SEXP C, SEXP N) { int n_protect = 2; SEXP Ans, Idx; PROTECT(Idx= getAttrib(C, install("pivot_idx"))); bool pivot = length(Idx) > 0; int n = INTEGER(N)[0], size = ncols(C), act_size = size; if (pivot) { SEXP dummy; PROTECT(dummy = getAttrib(C, install("pivot_actual_size"))); act_size = INTEGER(dummy)[0]; n_protect++; } Long n_act_size = (Long) n * act_size; int *pi = pivot ? INTEGER(Idx) : NULL; if (n == 1) PROTECT(Ans = allocVector(REALSXP, size)); else PROTECT(Ans = allocMatrix(REALSXP, size, n)); double *gauss = (double *) MALLOC(sizeof(double) * n_act_size); if (gauss == NULL) ERR0("memory allocation error"); GetRNGstate(); for (Long i=0; isize; switch (pt->method) { case Rcholesky : { //printf("RchoL\n"); int incx = 1; MEMCOPY(result, RHS, (Long) size * sizeof(double)); F77dtrmv("U", "T", "N", &size, pt->result, &size, result, &incx #ifdef USE_FC_LEN_T FCONE FCONE FCONE #endif ); } break; case GPUcholesky : // Alex: Gegebenenfalls schnelle GPU Version von Deiner Seite case direct_formula : case Cholesky : { // printf("intern\n"); bool pivot = (pt->actual_pivot == PIVOT_DO || pt->actual_pivot == PIVOT_IDX) && pt->method != direct_formula; if (pivot && pt->n_pivot_idx != size) BUG; sqrtRHS_Chol(pt->result, size, RHS, size, 1, result, pivot, pivot ? pt->actual_size : NA_INTEGER, pt->pivot_idx); return NOERROR; } case SVD : case Eigen : { double *U = pt->result; assert(U != NULL); #ifdef DO_PARALLEL #pragma omp parallel for num_threads(CORES) schedule(static) if (MULTIMINSIZE(size)) #endif for (Long i=0; iD != NULL); F77amuxmat(&size, &size, &one, RHS, pt->D, pt->lnz, pt->xja, pt->xlnz); for (Long i=0; iD[pt->invp[i]]; } break; case Diagonal : { Long i, j, sizeP1 = size + 1; double *D = pt->result; assert(D != NULL); for (i=j=0; jmethod %d\n", pt->method); BUG; } return NOERROR; } RandomFieldsUtils/src/options.h0000644000176200001440000001100214227157055016277 0ustar liggesusers/* Authors Martin Schlather, schlather@math.uni-mannheim.de main library for unconditional simulation of random fields Copyright (C) 2021 -- 2021 Martin Schlather This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #ifndef rfutils_options_H #define rfutils_options_H 1 //#include "AutoRandomFieldsUtilsLocal.h" #include "RFU.h" #if defined SCHLATHERS_MACHINE #define DETERM_LAMODE false #else #define DETERM_LAMODE true #endif #define basicN 11 // IMPORTANT: all names of basic must be have least 3 letters !!! typedef // benoetigt struct basic_options { int Rprintlevel, Cprintlevel, seed, cores, efficient,//allow for different level later on dummy0[4]; bool skipchecks, helpinfo, asList /* hidden:verbose */, bigendian, warn_parallel, dummy6, dummy7; int dummy8[8]; } basic_options; #define basic_START \ { R_PRINTLEVEL, C_PRINTLEVEL, \ NA_INTEGER, INITCORES, \ true, /* different levels later on */ \ {0, 0, 0, 0}, \ false, true, true, false, \ false, false, false, \ {0,0,0,0, 0,0,0,0} \ } #define installNrunN 9 #define MAX_GPU_DEVICES 16 typedef // benoetigt #define INSTALL_RUN_WARN_OPTION 1 struct installNrun_options { int warn_unknown_option, LaMaxTakeIntern, gpu_devices[MAX_GPU_DEVICES], Ngpu_devices, maxStreams, dummy0[4]; install_modes install, dummy1; la_modes la_usr, la_mode, dummy2; usr_bool mem_is_aligned; bool installPackages, determineLAmode, kahanCorrection, dummy3, dummy4, dummy5, dummy6, dummy7; int dummy8[8]; } installNrun_options; #define installNrun_START \ { WARN_UNKNOWN_OPTION_ALL, MAXINT, \ {0}, 0, 0, \ {0, 0, 0, 0}, \ INSTALL_DEFAULT, Inone, \ LA_AUTO, LA_R, LA_AUTO, /*LA_R */ \ MEMisALIGNED, \ true, false, DETERM_LAMODE, \ false, \ false, false, false, false, \ {0,0,0,0, 0,0,0,0} \ } #define SOLVE_SVD_TOL 3 #define solveN 21 typedef // benoetigt struct solve_options { usr_bool sparse, pivot_check, dummy0, dummy1; bool det_as_log, pivot_partialdet, pseudoinverse, dummy2, dummy3; double spam_tol, spam_min_p[2], svd_tol, eigen2zero, pivot_relerror, max_deviation, max_reldeviation, dummy4[5]; InversionMethod Methods[SOLVE_METHODS], dummy5; int spam_min_n[2], spam_sample_n, spam_factor, pivotsparse, max_chol, max_svd, pivot, // obsolete actual_size, *pivot_idx, n_pivot_idx,//permutation; phys+logi laenge tinysize, dummy6[10]; // bool tmp_delete; pivot_modes actual_pivot,pivot_mode, dummy7; int dummy8[10]; } solve_options; #ifdef SCHLATHERS_MACHINE #define svd_tol_start 1e-08 #else #define svd_tol_start 0 #endif #define solve_START \ False, False, False, False, \ true, false, false, false, false, \ 2.220446e-16, {0.8, 0.9}, svd_tol_start, 1e-12, 1e-11, \ 1e-10, 1e-10, \ {0.0, 0.0, 0.0, 0.0, 0.0}, \ {NoInversionMethod, NoFurtherInversionMethod},NoInversionMethod, \ {400, 10000}, 500, 4294967, PIVOTSPARSE_MMD, 16384, \ 10000, /* never change -- see RFoptions.Rd */ \ PIVOT_NONE, /* obsolete */ \ 0, NULL, 0, 3, \ {0,0,0,0,0, 0,0,0,0,0}, \ PIVOT_UNDEFINED, PIVOT_AUTO, PIVOT_UNDEFINED, /* PIVOT_NONE */ \ {0,0,0,0,0, 0,0,0,0,0} typedef // benoetigt struct dummy_options { int dummy[30]; } dummy_options; typedef // benoetigt struct utilsoption_type{ basic_options basic; installNrun_options installNrun; solve_options solve; dummy_options dummy; } utilsoption_type; #if defined OBSOLETE_RFU && ! 
defined obsolete_miraculix #else #define ADD(ELT) SET_VECTOR_ELT(sublist, k++, ELT) #define ADDCHAR(ELT) x[0] = ELT; ADD(ScalarString(mkChar(x))) #endif //int own_chol_up_to(int size, int maxtime); //int own_chol_up_to(); void SetLaMode(); void SetLaMode(la_modes, int cores); void solve_DELETE0(solve_storage *x); void resetInstalled(); #endif RandomFieldsUtils/src/Basic_utils_local.h0000644000176200001440000000376014227157055020233 0ustar liggesusers/* Authors Martin Schlather, schlather@math.uni-mannheim.de Copyright (C) 2021 -- 2021 Martin Schlather This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #ifndef basic_rfutils_local_h #define basic_rfutils_local_h 1 #define RFU_LOCAL 1 #include "intrinsics.h" #include "Basic_utils.h" #define F77dgeqrf F77call(dgeqrf) #define F77dsyevr F77call(dsyevr) #define F77dgetrf F77call(dgetrf) #define F77dgetrs F77call(dgetrs) #define F77dgetri F77call(dgetri) #define F77dgesv F77call(dgesv) #define F77dpotrf F77call(dpotrf) #define F77dtrmv F77call(dtrmv) F77name(spamdnscsr)(int *nrow, int* ncol, double* dns, int* ndns, double* a, int* ja, int* ia, double* eps);// #define F77spamdnscsr F77call(spamdnscsr) F77name(cholstepwise)(int*, int*, double* , int*, int*, int*, int*, int*, int*, int*, int*, int*, int*, int*, double* , int*, int*, int*, int*, int*); #define F77cholstepwise F77call(cholstepwise) F77name(calcja)(int*, int*, int*, int*, int*, int*, int*); #define F77calcja F77call(calcja) F77name(spamcsrdns)(int*, double *, int *, int*, double* ); // ok #define F77spamcsrdns F77call(spamcsrdns) F77name(backsolves)(int*, int*, int*, int*, int*, double* , int*, int*, int*, int*, double* , double* ); #define F77backsolves F77call(backsolves) F77name(amuxmat)(int*, int*, int*, double* , double* , double* , int*, int*); #define F77amuxmat F77call(amuxmat) #endif RandomFieldsUtils/src/win_linux_aux.cc0000644000176200001440000000443114227157055017643 0ustar liggesusers /* Authors Martin Schlather, schlather@math.uni-mannheim.de Collection of system specific auxiliary functions Copyright (C) 2001 -- 2021 Martin Schlather This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include "intrinsics.h" #if defined MSDOS_WINDOWS #define VC_EXTRALEAN #include #endif // achtung! 
windows.h zusammen mit // gibt warnung, da ERROR mehrfach definiert ! // deshalb auch in auxiliary.h nicht basic.h einbinden // obsolette ?!! #include #include #include "win_linux_aux.h" void sleepMilli(int *milli) { #if defined MSDOS_WINDOWS Sleep((long) *milli); // along #else usleep((useconds_t) (1000 * (unsigned long) *milli));// along #endif } void sleepMicro(int *micro) { #if defined MSDOS_WINDOWS Sleep((long) ((*micro + 500) / 1000));// along #else usleep((useconds_t) *micro); #endif } void pid(int *i) { #if defined MSDOS_WINDOWS *i = _getpid(); #else *i = getpid(); #endif } int parentpid=0; bool parallel() { int mypid; pid(&mypid); return mypid != parentpid; } void hostname(char **h, int *i){ #if defined MSDOS_WINDOWS *h[0]=0; #else gethostname(*h, *i); #endif } uint32_t cpuid_info(int Blatt, int Register) { #if defined MINGWCPUID uint32_t s[4]; __cpuid(Blatt, s[0], s[1], s[2], s[3]); return s[Register]; #elif defined WINCPUID uint32_t s[4]; __cpuid((int *)s, (int) Blatt); return s[Register]; #elif defined LINUXCPUID uint32_t s[4]; asm volatile ("cpuid": "=a"(s[0]), "=b"(s[1]),"=c"(s[2]), "=d"(s[3]):"a"(Blatt),"c"(0)); return s[Register]; #else return 0; #endif } RandomFieldsUtils/src/solve_61.cu0000644000176200001440000003735314227157055016443 0ustar liggesusers /* Authors Martin Schlather, schlather@math.uni-mannheim.de (library for simulation of random fields) Copyright (C) 2021 -- 2021 Alexander Freudenberg This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include #include #include #include #include #include #include #include #include "parallel_simd.h" #ifdef TIME_AVAILABLE #include #endif #include "Basic_utils_local.h" #include "errors_messages.h" #include "RandomFieldsUtils.h" #include "solve_gpu.h" #include "options.h" #include "xport_import.h" ASSERT_SIMD(solve_61, gpu); __global__ void logdet_kernel(double *d_matrix, Uint *d_size, double *d_logdet){ __shared__ double logdet_loc; __shared__ double submatrix[THREADS_PER_BLOCK]; logdet_loc = 0.0; *d_logdet = 0.0; int idx = blockDim.x * blockIdx.x + threadIdx.x, thread = threadIdx.x; if(idx < *d_size){ // if(THREADS_PER_BLOCK<=thread && PL >= PL_RECURSIVE) // PRINTF("Size %d, access %d",THREADS_PER_BLOCK,thread ); submatrix[thread] = d_matrix[idx * (*d_size +1)]; } __syncthreads(); atomicAdd(&logdet_loc, idx >= *d_size ? 0 : (LOG(submatrix[thread]))); __syncthreads(); if (threadIdx.x == 0) {atomicAdd(d_logdet, logdet_loc); }; }; int cholGPU(bool copy, double *matrix, Uint input_size, double *B, Uint rhs_cols, double *LogDet, double *RESULT){ /* This function solves the problem A x = b on an available GPU and writes the solution to the original memory Input: matrix: pointer to rowwise allocated matrix A individuals: number of individuals in matrix, i.e. 
dimension vector: pointer to vector b Ouput: vector: contains solution x after the function has been called */ KEY_type *KT = KEYT(); installNrun_options *iNr = &(KT->global_utils.installNrun); int *devices = iNr->gpu_devices; int N = iNr->Ngpu_devices; assert(iNr->Ngpu_devices <= MAX_GPU_DEVICES); int maxStreams = iNr->maxStreams; #ifdef TIME_AVAILABLE clock_t start = clock(); #endif //declare/define process variables Ulong size = (Ulong) input_size; int bufferSize = 0; int *info = NULL; int h_info = 0; double *buffer = NULL; cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER; cusolverDnHandle_t handle = NULL; cudaStream_t stream = NULL; //declare device variables double *d_matrix = NULL; double *d_B = NULL; double *d_logdet = NULL; Uint *d_size = NULL; //initialize handle and stream, calculate buffer size needed for cholesky cusolverDnCreate(&handle); cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); cusolverDnSetStream(handle, stream); cusolverDnDpotrf_bufferSize(handle, uplo, size, matrix, size, &bufferSize); cudaDeviceSynchronize(); //PRINTF("Buffersize: %f", ((float) bufferSize)/1073741824.0); cudaMalloc(&info, sizeof(int)); cudaMalloc(&buffer, sizeof(double) * bufferSize); //allocate memory on device cudaMalloc((void**)&d_matrix, sizeof(double) * size * size); cudaMalloc((void**)&d_B, sizeof(double) * size * rhs_cols); cudaMemset(info, 0, sizeof(int)); // if (PL > PL_RECURSIVE) // PRINTF("Size of alloc %ld", sizeof(double) * size * size); //copy data to device cudaMemcpy(d_matrix, matrix, sizeof(double) * size * size, cudaMemcpyHostToDevice); cudaMemcpy(d_B, B, sizeof(double) * size * rhs_cols, cudaMemcpyHostToDevice); cudaDeviceSynchronize(); //write cholesky factorization to device copy of A cusolverDnDpotrf(handle, uplo, size, d_matrix, size, buffer, bufferSize, info); //Synchronize is necessary, otherwise error code "info" returns nonsense cudaDeviceSynchronize(); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) PRINTF("%s\n", cudaGetErrorString(err)); //check for errors cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost); cudaDeviceSynchronize(); if (0 != h_info) { if(h_info >0)PRINTF("Error: Cholesky factorization failed at minor %d \n", h_info); if(h_info <0)PRINTF("Error: Wrong parameter in cholesky factorization at %d entry\n", h_info); err = cudaDeviceReset(); if(err != cudaSuccess)PRINTF("Device reset not successful"); return(1); } //calculate x = A\b cusolverDnDpotrs(handle, uplo, size, rhs_cols, d_matrix, size, d_B, size, info); cudaDeviceSynchronize(); err = cudaGetLastError(); if (err != cudaSuccess) PRINTF("Potrs: %s\n", cudaGetErrorString(err)); if(LogDet != NULL){ cudaMalloc((void**)&d_logdet, sizeof(double)); cudaMalloc((void**)&d_size, sizeof(Uint)); cudaMemcpy(d_size, &size, sizeof(Uint), cudaMemcpyHostToDevice); logdet_kernel <<< (size - 1)/THREADS_PER_BLOCK +1 ,THREADS_PER_BLOCK>>> (d_matrix, d_size, d_logdet); cudaDeviceSynchronize(); cudaMemcpy(LogDet, d_logdet, sizeof(double), cudaMemcpyDeviceToHost); cudaDeviceSynchronize(); cudaFree(d_size); cudaFree(d_logdet); } err = cudaGetLastError(); if (err != cudaSuccess) PRINTF("Err at Logdet: %s\n", cudaGetErrorString(err)); //*LogDet = 1.0; //copy solution from device to vector on host cudaMemcpy(RESULT, d_B, sizeof(double) * size * rhs_cols, cudaMemcpyDeviceToHost); err = cudaGetLastError(); if (err != cudaSuccess) PRINTF("Memcpy: %s\n", cudaGetErrorString(err)); //free allocated memory cudaFree(info); cudaFree(buffer); cudaFree(d_matrix); cudaFree(d_B); cusolverDnDestroy(handle); 
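// --- Illustrative note (comments only; nothing here is executed) ---
// The dense single-GPU path above follows the usual cuSOLVER sequence:
// query the workspace size, factor the device copy of A in place (potrf),
// then solve for all right-hand sides with that factor (potrs). A minimal
// sketch, assuming d_A (n x n, column-major) and d_B (n x nrhs) already
// reside on the device; error checks and frees are elided here:
//
//   cusolverDnHandle_t h; cusolverDnCreate(&h);
//   int lwork = 0, *d_info = NULL; double *d_work = NULL;
//   cusolverDnDpotrf_bufferSize(h, CUBLAS_FILL_MODE_LOWER, n, d_A, n, &lwork);
//   cudaMalloc((void**)&d_work, sizeof(double) * lwork);
//   cudaMalloc((void**)&d_info, sizeof(int));
//   cusolverDnDpotrf(h, CUBLAS_FILL_MODE_LOWER, n, d_A, n, d_work, lwork, d_info);
//   cusolverDnDpotrs(h, CUBLAS_FILL_MODE_LOWER, n, nrhs, d_A, n, d_B, n, d_info);
//
// After potrf, the logdet_kernel above accumulates sum_i log(L[i][i]) over the
// diagonal of the factor; the identity log det(A) = 2 * sum_i log(L[i][i])
// then relates this sum to the log-determinant of A.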
cudaStreamDestroy(stream); #ifdef TIME_AVAILABLE // if (PL >= PL_RECURSIVE) // PRINTF("Time: %.3f", (double)(clock() - start) / CLOCKS_PER_SEC); #endif return 0; }; // void mgpuSolve(double *matrix, Uint individuals, double *vector){ // /* // This function solves the problem // A x = b // on an MULTIPLE GPUs and writes the solution to the original memory of b // Input: // matrix: pointer to rowwise allocated matrix A // individuals: number of individuals in matrix, i.e. dimension // vector: pointer to vector b // Ouput: // vector: contains solution x after the function has been called // */ // // Define auxiliary variables // cusolverMgHandle_t handle = NULL; // const int max_devices = 8; // Maximum number of devices to be used // int nbGpus = 0; // std::vector deviceList; // const int N = individuals, lda = N; // Dimension of matrix // const int IA = 1; // const int JA = 1; // const int T_A = 256; //Tile size // const int IB = 1; // const int JB = 1; // const int T_B = 1000, ldb = N; // int info = 0; // int64_t lwork_potrf = 0, lwork_potrs = 0, lwork = 0 ; // cudaLibMgMatrixDesc_t descrA, descrB; // cudaLibMgGrid_t grid; // double **array_d_A = NULL; // double **array_d_B = NULL; // double **array_d_work = NULL; // // Create handles and select devices // cusolverStatus_t status = cusolverMgCreate(&handle); // if(CUSOLVER_STATUS_SUCCESS != status)PRINTF("Handle couldn't be created"); // cudaError_t cudaStat = cudaGetDeviceCount( &nbGpus ); // nbGpus = (nbGpus < max_devices)? nbGpus : max_devices; // cudaDeviceProp prop; // cudaGetDeviceProperties(&prop, 0); // int cc_major = prop.major, cc_minor = prop.minor; // for(int i = 0; i< nbGpus; i++){ // cudaDeviceProp prop; // cudaGetDeviceProperties(&prop, i); // if(prop.major == cc_major & prop.minor == cc_minor) // deviceList.push_back(i); // } // nbGpus = deviceList.size(); // status = cusolverMgDeviceSelect( // handle, // nbGpus, // &deviceList[0]); // if(CUSOLVER_STATUS_SUCCESS != status) PRINTF("Devices couldn't be selected."); // // Enable peer access for selected devices // for(int i = 0; i < nbGpus; i++){ // cudaSetDevice(deviceList[i]); // for(int j = 0; j< nbGpus; j++){ // if(i == j)continue; // cudaStat = cudaDeviceEnablePeerAccess(deviceList[j],0); // if(cudaStat != cudaSuccess)PRINTF("Device %d can't access device %d.",deviceList[i],deviceList[j]); // PRINTF("Access enabled for devices (%d,%d)",deviceList[i],deviceList[j]); // } // } // // Create device grid for vectors A, B // status = cusolverMgCreateDeviceGrid(&grid, 1, nbGpus, &deviceList[0], CUDALIBMG_GRID_MAPPING_COL_MAJOR ); // if(CUSOLVER_STATUS_SUCCESS != status)PRINTF("Grid couldn't be created."); // // Creeate matrix descriptions // status = cusolverMgCreateMatrixDesc( // &descrA, // N, /* nubmer of rows of (global) A */ // N, /* number of columns of (global) A */ // N, /* number or rows in a tile */ // T_A, /* number of columns in a tile */ // CUDA_R_64F, // grid ); // if(CUSOLVER_STATUS_SUCCESS != status)PRINTF("Matrix descriptions couldn't be created."); // status = cusolverMgCreateMatrixDesc( // &descrB, // N, /* nubmer of rows of (global) B */ // 1, /* number of columns of (global) B */ // N, /* number or rows in a tile */ // T_B, /* number of columns in a tile */ // CUDA_R_64F, // grid ); // if(CUSOLVER_STATUS_SUCCESS != status)PRINTF("Matrix description B couldn't be created."); // // Allocate arrays of device pointers which point at the memory allocated on each device // array_d_A = (double**) MALLOC (sizeof(double*) * nbGpus ); // array_d_B = 
(double**)MALLOC(sizeof(double*)*nbGpus); // array_d_work = (double**)MALLOC(sizeof(double*)*nbGpus); // MEMSET(array_d_work, 0, sizeof(void*)*nbGpus); // // Calculate block size on device // const int A_num_blks = ( N + T_A - 1) / T_A; // const int B_num_blks = ( N + T_B - 1) / T_B; // const int A_blks_per_device = (A_num_blks + nbGpus-1)/nbGpus; // const int B_blks_per_device = (B_num_blks + nbGpus-1)/nbGpus; // // Allocate memory on each device // for( int p = 0 ; p < nbGpus ; p++){ // cudaSetDevice(deviceList[p]); // cudaStat = cudaMalloc( &(array_d_A[p]), sizeof(double)*lda*T_A*A_blks_per_device ); // if(cudaSuccess != cudaStat)PRINTF("Memory for matrix A couldn't be allocated on device %d.",deviceList[p]); // cudaStat = cudaMalloc( &(array_d_B[p]), sizeof(double)*ldb*T_B*B_blks_per_device ); // if(cudaSuccess != cudaStat)PRINTF("Memory for matrix B couldn't be allocated on device %d.",deviceList[p]); // } // // Copy arrays A and B to device // for( int k = 0 ; k < A_num_blks ; k++){ // /* k = ibx * nbGpus + p */ // const int p = (k % nbGpus); // const int ibx = (k / nbGpus); // double *h_Ak = matrix + (size_t)lda*T_A*k; // double *d_Ak = array_d_A[p] + (size_t)lda*T_A*ibx; // const int width = MIN( T_A, (N - T_A*k) ); // cudaStat = cudaMemcpy(d_Ak, h_Ak, sizeof(double)*lda*width, cudaMemcpyHostToDevice); // if(cudaSuccess != cudaStat)PRINTF("Matrix A couldn't be copied at block (%d, %d).", p,ibx); // } // for( int k = 0 ; k < B_num_blks ; k++){ // /* k = ibx * nbGpus + p */ // const int p = (k % nbGpus); // const int ibx = (k / nbGpus); // double *h_Bk = vector + (size_t) T_B*k; // double *d_Bk = array_d_B[p] + (size_t) T_B*ibx; // cudaStat = cudaMemcpy(d_Bk, h_Bk, sizeof(double)*T_B, cudaMemcpyHostToDevice); // if(cudaSuccess != cudaStat)PRINTF("Matrix B couldn't be copied at block (%d, %d).", p,ibx); // } // // Calculate buffersizes necessary for potrf and potrs // cudaDeviceSynchronize(); // status = cusolverMgPotrf_bufferSize( // handle, // CUBLAS_FILL_MODE_LOWER, // N, // (void**)array_d_A, // IA, /* base-1 */ // JA, /* base-1 */ // descrA, // CUDA_R_64F, // &lwork_potrf); // if(CUSOLVER_STATUS_SUCCESS != status)PRINTF("Buffer size potrf couldn't be calculated"); // cudaDeviceSynchronize(); // status = cusolverMgPotrs_bufferSize( // handle, // CUBLAS_FILL_MODE_LOWER, // N, // 1, /* number of columns of B */ // (void**)array_d_A, // IA, // JA, // descrA, // (void**)array_d_B, // IB, // JB, // descrB, // CUDA_R_64F, // &lwork_potrs); // cudaDeviceSynchronize(); // cudaError_t err = cudaGetLastError(); // if (err != cudaSuccess) PRINTF("Buffersize calculation: %s\n", cudaGetErrorString(err)); // if(CUSOLVER_STATUS_SUCCESS != status)PRINTF("Buffer size potrs couldn't be calculated"); // lwork = (lwork_potrf > lwork_potrs)? 
lwork_potrf : lwork_potrs; // // Allocate workspace size // for(int idx = 0 ; idx < nbGpus ; idx++){ // int deviceId = deviceList[idx]; // cudaSetDevice( deviceId ); // void *d_workspace = NULL; // cudaStat = cudaMalloc(&d_workspace, sizeof(double)*lwork); // if( cudaSuccess != cudaStat )PRINTF("Workspace couldn't be allocated."); // ((void**)array_d_work )[idx] = d_workspace; // } // // Calculate potrf to workspace // status = cusolverMgPotrf( // handle, // CUBLAS_FILL_MODE_LOWER, // N, // (void**)array_d_A, // IA, // JA, // descrA, // CUDA_R_64F, // (void**)array_d_work, // lwork, // &info /* host */ // ); // cudaDeviceSynchronize(); // if(CUSOLVER_STATUS_SUCCESS != status) PRINTF("Potrf couldn't be calculated"); // if(info != 0)PRINTF("Info code %d", info); // // Calculate potrs to B // status = cusolverMgPotrs( // handle, // CUBLAS_FILL_MODE_LOWER, // N, // 1, /* number of columns of B */ // (void**)array_d_A, // IA, // JA, // descrA, // (void**)array_d_B, // IB, // JB, // descrB, // CUDA_R_64F, // (void**)array_d_work, // lwork, // &info /* host */ // ); // cudaDeviceSynchronize(); // if(CUSOLVER_STATUS_SUCCESS != status) PRINTF("Potrs couldn't be calculated"); // if(info != 0)PRINTF("Info code %d", info); // // Copy solution B back to host // for( int k = 0 ; k < B_num_blks ; k++){ // /* k = ibx * nbGpus + p */ // const int p = (k % nbGpus); // const int ibx = (k / nbGpus); // double *h_Bk = vector + (size_t) T_B*k; // double *d_Bk = array_d_B[p] + (size_t) T_B*ibx; // cudaStat = cudaMemcpy(h_Bk, d_Bk, sizeof(double)*T_B, cudaMemcpyDeviceToHost); // if(cudaSuccess != cudaStat)PRINTF("Matrix B couldn't be copied at block (%d, %d).", p,ibx); // } // // Free memory on device and host // for(int i = 0; i< nbGpus; i++){ // cudaSetDevice(deviceList[i]); // cudaDeviceReset(); // } // FREE(array_d_A); FREE(array_d_B); FREE(array_d_work); // } RandomFieldsUtils/src/sort.cc0000644000176200001440000004010014227157055015732 0ustar liggesusers/* Authors Martin Schlather, schlather@math.uni-mannheim.de Copyright (C) 2017 -- 2021 Martin Schlather This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ #include "Basic_utils_local.h" // must be before anything else #include "RandomFieldsUtils.h" #include "zzz_RandomFieldsUtils.h" typedef bool (*vergleich)(int, int, void *O); bool smaller1(int i, int j, void *orderd) { return ((double *) orderd)[i] < ((double *) orderd)[j]; } bool greater1(int i, int j, void *orderd) { return ((double *) orderd)[i] > ((double *) orderd)[j]; } bool smallerInt1(int i, int j, void *orderedint) { return ((int *) orderedint)[i] < ((int *)orderedint)[j]; } bool greaterInt1(int i, int j, void *orderedint) { return ((int *)orderedint)[i] > ((int *)orderedint)[j]; } typedef bool (*vergleichX)(int, int, int, void *O); // vergleichX SMALLERX=NULL, GREATERX=NULL; bool smaller(int i, int j, int orderdIM, void *O) { double *x, *y, *orderd = (double*) O; int d; x = orderd + i * orderdIM; y = orderd + j * orderdIM; for(d=0; d y[d]; return false; } bool smallerInt(int i, int j, int orderdIM, void *O) { int *x, *y, *orderedint = (int*) O; int d; x = orderedint + i * orderdIM; y = orderedint + j * orderdIM; for(d=0; d y[d]; return false; } void order(int *pos, int start, int end, vergleich SMALLER, vergleich GREATER, void * orderd, int order_from, int order_to) { int randpos, pivot, left, right, pivotpos, swap; if( start < end ) { //Get RNGstate();randpos = start + (int) (UNIFORM_RANDOM * (end-start+1)); PutRNGstate(); // use Get/Put RNGstate with great care !! randpos = (start + end) / 2; pivot = pos[randpos]; pos[randpos] = pos[start]; pos[start] = pivot; pivotpos=start; left = start; right=end+1; while (left < right) { //printf("order > %ld start=%d %d left=%d %d %d pivot=%d\n", pos, start, end, left, right, pos[left], pivot); while (++left < right && SMALLER(pos[left], pivot, orderd)) pivotpos++; while (--right > left && GREATER(pos[right], pivot, orderd)); if (left < right) { swap=pos[left]; pos[left]=pos[right]; pos[right]=swap; pivotpos++; } } pos[start] = pos[pivotpos]; pos[pivotpos] = pivot; if (start <= order_to && pivotpos > order_from) order(pos, start, pivotpos-1, SMALLER, GREATER, orderd, order_from, order_to); if (pivotpos < order_to && end >= order_from) order(pos, pivotpos + 1, end, SMALLER, GREATER, orderd, order_from, order_to); } } void Xorder(int *pos, int start, int end, vergleichX SMALLER,vergleichX GREATER, int D, void * orderd, int order_from, int order_to ) { int randpos, pivot, left, right, pivotpos, swap; if( start < end ) { //Get RNGstate();randpos = start + (int) (UNIFORM_RANDOM * (end-start+1)); PutRNGstate(); // use Get/Put RNGstate with great care !! 
randpos = (start + end) / 2; pivot = pos[randpos]; pos[randpos] = pos[start]; pos[start] = pivot; pivotpos=start; left = start; right=end+1; while (left < right) { //printf("order > %ld start=%d %d left=%d %d %d pivot=%d\n", pos, start, end, left, right, pos[left], pivot); while (++left < right && SMALLER(pos[left], pivot, D, orderd)) pivotpos++; while (--right > left && GREATER(pos[right], pivot, D, orderd)); if (left < right) { swap=pos[left]; pos[left]=pos[right]; pos[right]=swap; pivotpos++; } } pos[start] = pos[pivotpos]; pos[pivotpos] = pivot; if (start <= order_to && pivotpos > order_from) Xorder(pos, start, pivotpos-1, SMALLER, GREATER, D, orderd, order_from, order_to); if (pivotpos < order_to && end >= order_from) Xorder(pos, pivotpos + 1, end, SMALLER, GREATER, D, orderd, order_from, order_to); } } void orderingFromTo(double *d, int len, int dim, int *pos, int from, int to, usr_bool NAlast) { int start, end; if (NAlast == Nan) { for (int i=0; i start=%d %d left=%d %d %10g pivot=%10g\n", start, end, left, right, orderd[left], pivot); while (++left < right && orderd[left] < pivot) pivotpos++; while (--right > left && orderd[right] > pivot); if (left < right) { double swap = orderd[left]; orderd[left]=orderd[right]; orderd[right]=swap; pivotpos++; } } orderd[start] = orderd[pivotpos]; orderd[pivotpos] = pivot; if (start <= order_to && pivotpos > order_from) quicksort(start, pivotpos-1, orderd, order_from, order_to); if (pivotpos < order_to && end >= order_from) quicksort(pivotpos + 1, end, orderd, order_from, order_to); } } void sortingFromTo(double *d, int len, int from, int to, usr_bool NAlast) { int start, end; if (NAlast == Nan) { end = len-1; start = 0; } if (NAlast == True) { start = end = 0; int NAend = len - 1; while (end < NAend) { while (NAend >= 0 && (ISNA(d[NAend]) || ISNAN(d[NAend]))) NAend--; while (end < NAend && !ISNA(d[end]) && !ISNAN(d[end])) end++; if (end < NAend) { double swap = d[end]; d[end] = d[NAend]; d[NAend--] = swap; } } assert(NAend == end && false); } else { // if (NAlast == False) { start = end = len - 1; int NAstart = 0; while (start > NAstart) { while(NAstart < len && (ISNA(d[NAstart]) || ISNAN(d[NAstart]))) NAstart++; while (start > NAstart && !ISNA(d[start]) && !ISNAN(d[start])) start--; // printf("s = %d\n", start); if (start > NAstart) { double swap = d[start]; d[start] = d[NAstart]; d[NAstart++] = swap; } } assert(NAstart == start); } quicksort(start, end, d, from - 1, to - 1); // for (int i=0; i left && orderedint[right] > pivot); if (left < right) { int swap = orderedint[left]; orderedint[left]=orderedint[right]; orderedint[right]=swap; pivotpos++; } } orderedint[start] = orderedint[pivotpos]; orderedint[pivotpos] = pivot; if (start <= order_to && pivotpos > order_from) sortInt(start, pivotpos-1, orderedint, order_from, order_to); if (pivotpos < order_to && end >= order_from) sortInt(pivotpos + 1, end, orderedint, order_from, order_to); } } void sortingIntFromTo(int *d, int len, int from, int to, usr_bool NAlast) { /* quicksort algorithm, slightly modified: does not sort the data, but d[pos] will be ordered NOTE: pos must have the values 0,1,2,...,start-end ! 
(orderdouble is a kind of sorting pos according to the variable d) */ int start, end; if (NAlast == Nan) { end = len-1; start = 0; } if (NAlast == True) { start = end = 0; int NAend = len - 1; while (end < NAend) { while (NAend >= 0 && d[NAend] == NA_INTEGER) NAend--; while (end < NAend && d[end] != NA_INTEGER) end++; if (end < NAend) { int swap = d[end]; d[end] = d[NAend]; d[NAend--] = swap; } } assert(NAend == end && false); } else { // if (NAlast == False) { start = end = len - 1; int NAstart = 0; while (start > NAstart) { while(NAstart < len && d[NAstart] == NA_INTEGER) NAstart++; while (start > NAstart && d[start] != NA_INTEGER) start--; if (start > NAstart) { double swap = d[start]; d[start] = d[NAstart]; d[NAstart++] = swap; } } assert(NAstart == start); } sortInt(start, end, d, from - 1, to - 1); } void sortingInt(int *d, int len, usr_bool NAlast) { sortingIntFromTo(d, len, 1, len, NAlast); } SEXP sortX(SEXP Data, SEXP From, SEXP To, SEXP NAlast) { if (length(Data) > MAXINT) BUG; int err = NOERROR, len = length(Data), from = MAX(1, INTEGER(From)[0]), to = MIN(INTEGER(To)[0], len); if (from > to) return R_NilValue; usr_bool nalast; if (LOGICAL(NAlast)[0] == NA_LOGICAL) nalast = Nan; else nalast = LOGICAL(NAlast)[0] ? True : False; SEXP Ans; if (TYPEOF(Data) == REALSXP) { // printf("%d %d %d %d\n", from, to, INTEGER(To)[0], len); PROTECT(Ans=allocVector(REALSXP, to - from + 1)); int bytes = len * sizeof(double); double *data; if ((data = (double*) MALLOC(bytes)) == NULL) { err = ERRORMEMORYALLOCATION; goto ErrorHandling; } MEMCOPY(data, REAL(Data), bytes); sortingFromTo(data, len, from, to, nalast); from--; double *ans; ans = REAL(Ans); for (int i=from; i MAXINT) BUG; int err = NOERROR, len = length(Data), from = MAX(1, INTEGER(From)[0]), to = MIN(INTEGER(To)[0], len); if (from > to) return R_NilValue; SEXP Ans; PROTECT(Ans=allocVector(INTSXP, to - from + 1)); usr_bool nalast; if ( LOGICAL(NAlast)[0] == NA_LOGICAL) nalast = Nan; else nalast = LOGICAL(NAlast)[0] ? True : False; int bytes = len * sizeof(int), *pos = (int*) MALLOC(bytes); if (pos == NULL) {err = ERRORMEMORYALLOCATION; goto ErrorHandling;} if (TYPEOF(Data) == REALSXP) { // printf("%d %d %d %d\n", from, to, INTEGER(To)[0], len); orderingFromTo(REAL(Data), len, 1, pos, from, to, nalast); } else if (TYPEOF(Data) == INTSXP) { orderingIntFromTo(INTEGER(Data), len, 1, pos, from, to, nalast); } else { err = ERRORFAILED; goto ErrorHandling; } from--; int *ans; ans = INTEGER(Ans); for (int i=from; i to) return R_NilValue; int *pos = (int*) MALLOC(len * sizeof(int)); usr_bool nalast = LOGICAL(NAlast)[0] == NA_LOGICAL ? Nan : LOGICAL(NAlast)[0] ? True : False; SEXP Ans; if (TYPEOF(Data) == REALSXP) { // printf("%d %d %d %d\n", from, to, INTEGER(To)[0], len); PROTECT(Ans=allocVector(REALSXP, to - from + 1)); double *ans = REAL(Ans), *data = REAL(Data); ordering(data, len, dim, pos, from, to, nalast); from--; for (int i=from; i // uintptr_t #include "def.h" #include "parallel_simd.h" #if defined MINGWCPUID #include #elif defined WINCPUID //#warning loading intrin.h the first time #include #endif //#if defined _ _ARM_NEON //#include //#if defined(_ _LP64_ _) && _ _LP64_ _ //#endif //#endif #if ! 
defined MEMisALIGNED #define MEMisALIGNED Nan #endif #if defined ARM32 && defined SSE2 #include "sse2neon.h" #elif defined AVX || defined SSE2 //|| defined AVX2 || defined #include #endif #if __GNUC__ > 4 || \ (__GNUC__ == 4 && (__GNUC_MINOR__ > 9 || \ (__GNUC_MINOR__ == 9 && __GNUC_PATCHLEVEL__ >= 1))) //#define OpenMP4 1 #endif union uni32{ uint32_t vi; float f4[1]; uint32_t u32[1]; uint16_t u16[2]; uint8_t u8[4]; }; union uni64{ uint64_t vi; uint64_t u64[1]; double d8[1]; float f4[2]; uint32_t u32[2]; uint16_t u16[4]; uint8_t u8[8]; }; union uni128{ #if defined SSE2 __m128i vi; __m128d d; __m128 f; __m128d d128[1]; #endif uint64_t u64[2]; uint32_t u32[4]; uint8_t u8[16]; // __m64 m64[2]; double halfd[2], d8[2]; float halff[4], f4[4]; }; union uni256 { #if defined AVX2 __m256i vi; __m256d d; __m256 f; #endif #if defined SSE2 || defined AVX2 __m128i i128[2]; __m128d d128[2]; __m128d halfd[2]; __m128 halff[2]; #endif uint64_t u64[4]; uint32_t u32[8]; uint8_t u8[32]; // __m64 m64[4]; double d8[4]; float f4[8]; }; union uni512 { #if defined AVX512 __m512i vi; __m512d d; __m512 f; #endif #if defined AVX2 || defined AVX512 __m256i i256[2]; __m256d d256[2]; __m256d halfd[2]; __m256 halff[2]; #endif #if defined SSE2 || defined AVX2 || defined AVX512 __m128i i128[4]; __m128d d128[4]; __m128 f128[4]; #endif uint64_t u64[8]; uint32_t u32[16]; uint8_t u8[64]; // __m64 m64[4]; double d8[8]; float f4[16]; }; #define BitsPerByte 8U #if defined AVX512 #define SIMD_AVAILABILITY avx512f #define SSEBITS 512U #define SSEMODE 30U #define BlockType0 __m512i #define BlockType __m512i ALIGNED #define UnionType0 uni512 #define Double __m512d #define LOADuDOUBLE _mm512_loadu_pd #define LOADU _mm512_loadu_si512 // _mm512_lddqu_si512 #if defined MEM_IS_ALIGNED #define LOADDOUBLE _mm512_load_pd #define LOAD _mm512_load_si512 #else #define LOAD LOADU #define LOADDOUBLE LOADuDOUBLE #endif #define MAXDOUBLE _mm512_max_pd #define ADDDOUBLE _mm512_add_pd #define SUBDOUBLE _mm512_sub_pd #define MULTDOUBLE _mm512_mul_pd #define STOREuDOUBLE _mm512_storeu_pd #define ZERODOUBLE _mm512_setzero_pd #define MULTFLOAT _mm512_mul_ps #define ADDFLOAT _mm512_add_ps #define SUBFLOAT _mm512_sub_ps #define ZEROFLOAT _mm512_setzero_ps //#define BLENDFLOAT _mm256_blend_ps //#define DUPLICATEFLOAT _mm512_moveldup_ps #define MASK0ADDDOUBLE(A,M,B) _mm512_maskz_add_pd(A, M, A, B) // #define BLENDDOUBLE _mm256_blend_pd #define DUPLICATEDOUBLE _mm512_movedup_pd #define MAXINTEGER _mm512_max_epi32 #define AND _mm512_and_si512 #define OR _mm512_or_si512 #define XOR _mm512_xor_si512 #define ANY(A) (! _mm512_kortestz(_mm512_test_epi32_mask(A, A), _mm512_test_epi32_mask(A, A))) #define SHR32 _mm512_srli_epi32 // see also _mm512512_rol_epi64, #define SHL32 _mm512_slli_epi32 #define SHR16 _mm512_srli_epi16 #define SHR64 _mm512_srli_epi64 #define SHL64 _mm512_slli_epi64 #define SET16 _mm512_set1_epi16 #define SET32 _mm512_set1_epi32 #define SET64 _mm512_set1_epi64 // oder _m512d _mm512_set1_pd (double a) #define ZERO _mm512_setzero_si512 #define STORE_DOUBLE _mm512_store_pd //#define EXTRACT16 _mm512_extract_epi16 #define ADD32 _mm512_add_epi32 #define MADD16 _mm512_madd_epi16 #define ADD64 _mm512_add_epi64 #define MULT32 _mm512_mullo_epi32 #define SET8 _mm512_set1_epi8 // nicht! 
BW #define SETREV8( B15,B14,B13,B12,B11,B10,B9,B8,B7,B6,B5,B4,B3,B2,B1,B0) \ _mm512_set_epi8(B15,B14,B13,B12,B11,B10,B9,B8,B7,B6,B5,B4,B3,B2,B1,B0, \ B15,B14,B13,B12,B11,B10,B9,B8,B7,B6,B5,B4,B3,B2,B1,B0, \ B15,B14,B13,B12,B11,B10,B9,B8,B7,B6,B5,B4,B3,B2,B1,B0, \ B15,B14,B13,B12,B11,B10,B9,B8,B7,B6,B5,B4,B3,B2,B1,B0) #if defined AVX512BW #define ADD8 _mm512_add_epi8 #define SAD8 _mm512_sad_epu8 #define SHUFFLE8 _mm512_shuffle_epi8 #elif #define LOWER256(A) (__m256i) _mm512_extractf64x4_pd((__m512d) (A), 0) #define UPPER256(A) (__m256i) _mm512_extractf64x4_pd((__m512d) (A), 1) #define DO_256(X, A, B) \ _mm512_inserti64x4(_mm512_zextsi256_si512(X(LOWER256(A), LOWER256(B))), \ X(UPPER256(A), UPPER256(B)), 1) #define ADD8(A, B) DO_256(_mm256_add_epi8, A, B) #define SAD8(A, B) DO_256(_mm256_sad_epu8, A, B) #define SHUFFLE8 DO_256(_mm256_shuffle_epi8, A, B) #endif #elif defined AVX #define SSEBITS 256U #define SSEMODE 20U #define BlockType0 __m256i #define BlockType __m256i ALIGNED #define UnionType0 uni256 #define Double __m256d #define LOADuDOUBLE _mm256_loadu_pd #if defined MEM_IS_ALIGNED #define LOADDOUBLE _mm256_load_pd #else #define LOADDOUBLE LOADuDOUBLE #endif #define MAXDOUBLE _mm256_max_pd #define ADDDOUBLE _mm256_add_pd #define SUBDOUBLE _mm256_sub_pd #define MULTDOUBLE _mm256_mul_pd #define STOREuDOUBLE _mm256_storeu_pd #define ZERODOUBLE _mm256_setzero_pd #define MULTFLOAT _mm256_mul_ps #define ADDFLOAT _mm256_add_ps #define SUBFLOAT _mm256_sub_ps #define ZEROFLOAT _mm256_setzero_ps #define BLENDFLOAT _mm256_blend_ps #define DUPLICATEFLOAT _mm256_moveldup_ps #define MASK0ADDDOUBLE(A,M,B) _mm256_maskz_add_pd(A, M, A, B) #define BLENDDOUBLE _mm256_blend_pd #define DUPLICATEDOUBLE _mm256_movedup_pd #if defined AVX2 #define LOADU _mm256_loadu_si256 // _mm256_lddqu_si256 #if defined MEM_IS_ALIGNED #define LOAD _mm256_load_si256 // _mm256_lddqu_si256 #else #define LOAD LOADU #endif #define MAXINTEGER _mm256_max_epi32 #define AND _mm256_and_si256 #define OR _mm256_or_si256 #define XOR _mm256_xor_si256 #define ANY(A) (!_mm256_testz_si256(A, A)) #define SHR32 _mm256_srli_epi32 // see also _mm256512_rol_epi64, #define SHL32 _mm256_slli_epi32 #define SHR16 _mm256_srli_epi16 #define SHR64 _mm256_srli_epi64 #define SHL64 _mm256_slli_epi64 #define SHUFFLE8 _mm256_shuffle_epi8 #define SET8 _mm256_set1_epi8 #define SETREV8( B15,B14,B13,B12,B11,B10,B9,B8,B7,B6,B5,B4,B3,B2,B1,B0) \ _mm256_setr_epi8(B15,B14,B13,B12,B11,B10,B9,B8,B7,B6,B5,B4,B3,B2,B1,B0, \ B15,B14,B13,B12,B11,B10,B9,B8,B7,B6,B5,B4,B3,B2,B1,B0) #define SET16 _mm256_set1_epi16 #define SET32 _mm256_set1_epi32 #define SET64 _mm256_set1_epi64x // oder _m256d _mm256_set1_pd (double a) #define ZERO _mm256_setzero_si256 #define STORE_DOUBLE _mm256_store_pd #define EXTRACT16 _mm256_extract_epi16 #define ADD8 _mm256_add_epi8 #define ADD32 _mm256_add_epi32 #define MADD16 _mm256_madd_epi16 #define ADD64 _mm256_add_epi64 #define SAD8 _mm256_sad_epu8 #define MULT32 _mm256_mullo_epi32 #define SIMD_AVAILABILITY avx2 #else #define SIMD_AVAILABILITY avx #define MAXINTEGER _mm_max_epi32 #endif #elif defined SSE2 #define SSEBITS 128U #define SSEMODE 10U #define BlockType0 __m128i #define BlockType __m128i ALIGNED #define UnionType0 uni128 #define Double __m128d #define LOADU _mm_loadu_si128 #define LOADuDOUBLE _mm_loadu_pd #if defined MEM_IS_ALIGNED #define LOADDOUBLE _mm_load_pd #define LOAD _mm_load_si128 #else #define LOAD LOADU #define LOADDOUBLE LOADuDOUBLE #endif #define MAXDOUBLE _mm_max_pd #define MAXINTEGER _mm_max_epi32 #define ADDDOUBLE 
_mm_add_pd #define SUBDOUBLE _mm_sub_pd #define MULTDOUBLE _mm_mul_pd #define STOREuDOUBLE _mm_storeu_pd #define ZERODOUBLE _mm_setzero_pd #define MULTFLOAT _mm_mul_ps #define ADDFLOAT _mm_add_ps #define SUBFLOAT _mm_sub_ps #define ZEROFLOAT _mm_setzero_ps #define BLENDFLOAT _mm_blend_ps #define DUPLICATEFLOAT _mm_moveldup_ps #define AND _mm_and_si128 #define OR _mm_or_si128 #define XOR _mm_xor_si128 bool any128(__m128i A); #define ANY(A) any128(A) #define SHR32 _mm_srli_epi32 // see also _mm512_rol_epi64, #define SHL32 _mm_slli_epi32 #define SHR16 _mm_srli_epi16 #define SHR64 _mm_srli_epi64 #define SHL64 _mm_slli_epi64 #define SET8 _mm_set1_epi8 #define SETREV8 _mm_setr_epi8 #define SET16 _mm_set1_epi16 #define SET32 _mm_set1_epi32 #define SET64 _mm_set1_epi64x #define ZERO _mm_setzero_si128 #define STORE_DOUBLE _mm_store_pd #define EXTRACT16 _mm_extract_epi16 #define ADD8 _mm_add_epi8 #define ADD32 _mm_add_epi32 #define ADD64 _mm_add_epi64 #define MADD16 _mm_madd_epi16 #define SAD8 _mm_sad_epu8 // _pu8? #define INT2FLOAT _mm_cvtepi32_ps #define INT2DOUBLE _mm_cvtpi32_pd // very expensive #define BLENDDOUBLE _mm_blend_pd #define DUPLICATEDOUBLE _mm_movedup_pd //#define MOVEMASK _mm_movemask_ps //#define BLEND _mm_blend_pd //see also _mm512_mask_inserti64x4_mm_insert_epi64 #if defined SSSE3 // within SSE2 #define SIMD_AVAILABILITY sse2 #define SHUFFLE8 _mm_shuffle_epi8 #else #define SIMD_AVAILABILITY ssse3 #endif #elif defined MMX || defined PlainInteger64 // particularly Bit23 #define SIMD_AVAILABILITY no_sse #define SSEBITS 64U #define SSEMODE 0U #define BlockType0 uint64_t #define BlockType BlockType0 #define UnionType0 uni64 #define AND(B,C) (B) & (C) #define OR(B,C) (B) | (C) #define XOR(B,C) (B) xor (C) #define SHR64(B,C) (B) >> (C) #define SHR32 SHR64 // unsafe #define SHR16 SHR64 // unsafe #define SHL64(B,C) (B) << (C) #define SHL32 SHL64 // unsafe #define SHL16 SHL64 // unsafe #define SET32 (Ulong) 0x0000000100000001L * (Ulong) #define ADD64(B,C) (B) + (C) #define ADD32 ADD64 // unsafe #define ZERO() 0L #define LOADU(A) *(A) #define LOAD LOADU #define SET8(A) (((BlockType0) (A)) * ((BlockType0) 0x0101010101010101L)) #if defined MMX #define ADD8(B,C) (BlockType0) _mm_add_pi8((__m64) B, (__m64) C) #else #define ADD8(B,C) (((BlockType0) (B)) + ((BlockType0) (C))) // unsafe #endif #define ANY #else #define SIMD_AVAILABILITY no_sse #define SSEBITS 32U #define SSEMODE 0U #define BlockType0 uint32_t #define BlockType BlockType0 #define UnionType0 uni32 #if defined PlainInteger32 #define AND(B,C) (B) & (C) #define OR(B,C) (B) | (C) #define XOR(B,C) (B) xor (C) #define SHR32(B,C) (B) >> (C) #define SHR16 SHR32 // unsafe #define SHL32(B,C) (B) << (C) #define SHL16 SHL32 // unsafe #define SHL64 SHL32 // unsafe #define SET32 #define ADD64(B,C) (B) + (C) #define ZERO() 0U #define ANY #define LOADU(A) *(A) #define LOAD LOADU #define SET8(A) (((BlockType0) (A)) * ((BlockType0) 0x01010101U)) #define ADD8(B,C) (((BlockType0) (B)) + ((BlockType0) (C))) // unsafe ! #else #if defined __GNUC__ && defined SCHLATHERS_MACHINE #warning No specification of any SIMD. #endif #endif #endif // AVX512 .. 
PlaintInteger32 #if defined AVX #define SCALAR_DEFAULT SCALAR_NEARFMA #else #define SCALAR_DEFAULT SCALAR_BASE #endif #define BytesPerBlock (SSEBITS / BitsPerByte) #define ALIGNED __attribute__ ((aligned (BytesPerBlock))) #define doubles (BytesPerBlock / 8) #define integers (BytesPerBlock / 4) /////////////////////////////////////////////////////////////////////// // checks whether current hardware matches the compilation // * mainly intel (and amd) cores // * but also GPU /////////////////////////////////////////////////////////////////////// #define noMISS 0U #define noUSE 0U #define anyrelevantUSE 0U #define gpuUSE 1U #define avx2USE 2U #define avxUSE 3U #define ssse3USE 4U #define sse2USE 5U #define avx512fUSE 6U #define USEnMISS 10U #define gpuMISS 11U #define avx2MISS 12U #define avxMISS 13U #define ssse3MISS 14U #define sse2MISS 15U #define avx512fMISS 16U #define anyMISS (1 << gpuMISS) | (1 << avx2MISS) | (1 << avxMISS) | \ (1 << ssse3MISS) | (1 << sse2MISS) | (1 << avx512fMISS) #define SIMD_INFO \ allmiss | alluse | (HAS_PARALLEL || alluse != 0) * (1 << anyrelevantUSE) | \ ((HAS_PARALLEL || alluse != noUSE) && !(HAS_PARALLEL && allmiss==noMISS)) * \ (1 << USEnMISS) #if defined EAX #if EAX != 0 #define EXX_REDEFINED 1 #endif #else #define EAX 0 #endif #if defined EBX #if EBX != 1U #define EXX_REDEFINED 1 #endif #else #define EBX 1 #endif #if defined ECX #if ECX != 2 #define EXX_REDEFINED 1 #endif #else #define ECX 2 #endif #if defined EDX #if EDX != 3 #define EXX_REDEFINED 1 #endif #else #define EDX 3 #endif //#define sse3 Available(1, ECX,0) #define no_sseAvail true #define no_sseMISS 999U #define no_sseUSE 999U #define ssse3Avail Available(1, ECX,9) #define sse41Avail Available(1, ECX,19) #define avxAvail Available(1, ECX,28) #define sseAvail Available(1, EDX,25) #define sse2Avail Available(1, EDX,26) #define avx2Avail Available(7, EBX,5) #define avx512fAvail Available(7, EBX,16) #define avx512dqAvail Available(7, EBX, 17) #define avx512pfAvail Available(7, EBX,26) #define avx512erAvail Available(7, EBX,27) #define avx512cdAvail Available(7, EBX,28) #define avx512bwAvail Available(7, EBX,30) #define avx512vlAvail Available(7, EBX,31) #define avx512vbmiAvail Available(7, ECX, 1) #define avx512vmbi2Avail Available(7, ECX, 6) #define avx512vnniAvail Available(7, ECX, 11) #define avx512bitalgAvail Available(7, ECX, 12) #define avx512popcntAvail Available(7, ECX, 14) #define avx512intersectAvail Available(7, EDX, 8) #define avx512fp16Avail Available(7, EDX, 23) #define avx512bf16Avail Available(7, EAX, 5) // intel Advanced Matrix Calculations #define amxbf16Avail Available(7, EDX, 22) #define amxtileAvail Available(7, EDX, 24) #define amxint8Avail Available(7, EDX, 25) /* PRINTF("blatt %d: %u %u %u %u\n", Blatt, s[0], s[1], s[2], s[3]); \ uint32_t a = s[Register];\ for (int i=31; i>=0; i--){if (i == Bit) PRINTF(" :");PRINTF("%s", (a >> i) & 1 ? 
"1" : "0");if (i%4 == 0) PRINTF(" ");} PRINTF(" register=%d bit=%d %d: %d %d\n", Register, Bit, bit_SSE, s[Register] & (1 << (Bit)), (s[Register] >> Bit) & 1); \ */ #define AVAILABLE_SIMD_OK static inline bool \ Available(unsigned VARIABLE_IS_NOT_USED B, int VARIABLE_IS_NOT_USED R, \ int VARIABLE_IS_NOT_USED Bit) { return true; } #if defined EXX_REDEFINED // unknown system -- don't perform checks #define INSTALL_DEFAULT Inone #define AVAILABLE_SIMD AVAILABLE_SIMD_OK #elif defined ARM32 #define INSTALL_DEFAULT Iask #if defined CROSS_CAPACITY #error "ARM allows only CROSS=noflags and CROSS=FALSE" #elif defined REQUIRED_SIMD && REQUIRED_SIMD <= 2 #error "ARM allows CROSS=noflags and CROSS=FALSE, only." #endif #define AVAILABLE_SIMD AVAILABLE_SIMD_OK #elif defined __APPLE__ // i.e. apple but isn't arm #define INSTALL_DEFAULT Inone #if defined CROSS_CAPACITY #error "old MAC-OS allows only CROSS=noflags and CROSS=FALSE" #elif defined REQUIRED_SIMD && REQUIRED_SIMD != 3 #error "old MAC-OS allows CROSS=noflags and CROSS=FALSE, only." #endif #if defined REQUIRED_SIMD #undef REQUIRED_SIMD #endif #define AVAILABLE_SIMD AVAILABLE_SIMD_OK #elif defined WINCPUID #define INSTALL_DEFAULT Iask #define AVAILABLE_SIMD static inline bool \ Available(unsigned Blatt, int Register, int Bit) { \ uint32_t s[4]; \ __cpuid((int *)s, (int) Blatt); \ return s[Register] & (1 << (Bit)); \ } } #if ! defined MSDOS_WINDOWS #error Puzzled about the underlying system. Please contact maintainer. #endif #elif defined LINUXCPUID #define INSTALL_DEFAULT Iask #define AVAILABLE_SIMD static inline bool \ Available(unsigned Blatt, int Register, int Bit) { \ uint32_t s[4]; \ asm volatile \ ("cpuid": "=a"(s[0]), "=b"(s[1]),"=c"(s[2]), \ "=d"(s[3]):"a"(Blatt),"c"(0)); \ return s[Register] & (1 << (Bit)); \ } #elif defined MINGWCPUID #define INSTALL_DEFAULT Iask // vgl https://github.com/luzexi/MinGW/blob/master/x64/lib/gcc/x86_64-w64-mingw32/4.8.0/include/cpuid.h #if defined SCHLATHERS_MACHINE #define REACT_ON_DIFFERENT_CPUID_RESULTS \ uint32_t u[4]; \ asm volatile \ ("cpuid": "=a"(u[0]), "=b"(u[1]),"=c"(u[2]), \ "=d"(u[3]):"a"(Blatt),"c"(0)); \ PRINTF("%u %u %u %u\n%u %u %u %u\n%u %u %u %u\n", \ u[0],u[1],u[2],u[3], \ t[0],t[1],t[2],t[3], \ s[0],s[1],s[2],s[3]); \ if ((s[0] != t[0] || s[1] != t[1] || s[2] != t[2] || s[3] !=t[3])) BUG #else #define REACT_ON_DIFFERENT_CPUID_RESULTS return false #endif #define AVAILABLE_SIMD static inline bool \ Available(unsigned Blatt, int Register, int Bit) { \ unsigned int t[4]; \ if (!__get_cpuid(Blatt, t, t+1, t+2, t+3)) \ ERR1("unallowed cpuid access. %.80s", CONTACT); \ unsigned int s[4]; \ __cpuid(Blatt, s[0], s[1], s[2], s[3]); \ if ((s[0] != t[0] || s[1] != t[1] || s[2] != t[2] || s[3] != t[3])) { \ /* __get_cpuid does not seem to work for certain registers */ \ /* indeed results may differ (14 Jan 2022) ! */ \ REACT_ON_DIFFERENT_CPUID_RESULTS; } \ return s[Register] & (1 << (Bit)); \ } #else #define INSTALL_DEFAULT Inone #define AVAILABLE_SIMD static inline bool \ Available(unsigned VARIABLE_IS_NOT_USED B, int VARIABLE_IS_NOT_USED R, \ int VARIABLE_IS_NOT_USED Bit) { \ RFERROR("SIMD checks are not available on your system (on MS systems only under Visual Studio). Use 'CROSS' on Linux systems and alike."); \ return false; \ } #if defined REQUIRED_SIMD #undef REQUIRED_SIMD #endif #endif #if defined CROSS_CAPACITY #if defined REQUIRED_SIMD #define ASSERT_TEXT \ "But the CPU doesn't know about it. 
As 'CROSS=TRUE' has been chosen as compilation option, it was assumed that the programme was compiled on the most unskilled CPU." // ok #else #define ASSERT_TEXT \ "But the CPU doesn't know about it. As 'CROSS' has been chosen as compilation option, it was assumed that each CPU has at least the CROSS skills." #endif #elif defined REQUIRED_SIMD // ! CROSS_CAPACITY #if REQUIRED_SIMD == 0 // CROSS = nosimd without -mno-sse2 #define ASSERT_TEXT \ "This means 'without SIMD', but the compiler includes SIMD. ('CROSS=nosimd' has been chosen.)" #elif REQUIRED_SIMD == 1 // CROSS = nosimd and -mno-sse2 #define ASSERT_TEXT\ "This means'without SIMD', but the CPU requires SIMD at a higher level. Please contact the maintainer." #elif REQUIRED_SIMD == 2 // CROSS=NA #define ASSERT_TEXT \ "This means 'without SIMD'), but the compiler includes SIMD (at a higher level). ('CROSS=NA' had been chosen.)" #elif REQUIRED_SIMD == 3 // CROSS=F ALSE #if defined AVAILABLE_SIMD #undef AVAILABLE_SIMD #endif #define AVAILABLE_SIMD AVAILABLE_SIMD_OK #define ASSERT_TEXT\ "This situation is unexpected for a PC. Please contact the maintainer." #elif REQUIRED_SIMD == 4 #define ASSERT_TEXT\ "This situation is unexpected on ARM. Please contact the maintainer." #else #define ASSERT_TEXT\ "This leads to an unexpected situation. Please contact the maintainer." #endif #else // ! CROSS_CAPACITY && ! REQUIRED_SIMD #define ASSERT_TEXT\ "This situation is unexpected for a server. Please contact the maintainer." #if defined AVAILABLE_SIMD #undef AVAILABLE_SIMD #endif #define AVAILABLE_SIMD AVAILABLE_SIMD_OK #endif #define ASSERT_AVAILABILITY(V,W) if ((V##Avail)) {} else {char msg[300]; SPRINTF(msg, "The program was compiled for '%.10s%.5s%.10s. %.200s'", #V, STRCMP(#V, #W) ? " && " : "", STRCMP(#V, #W) ? #W : "", ASSERT_TEXT); RFERROR(msg);} #define ASSERT_AVAILABILITY_AUX(V,W) ASSERT_AVAILABILITY(V,W) // expands V #define ASSERT_SIMD(FILE, WHAT) \ AVAILABLE_SIMD \ Uint check_simd_##FILE() { \ ASSERT_AVAILABILITY_AUX(SIMD_AVAILABILITY,WHAT); return noMISS;}\ Uint simd_use_##FILE = WHAT##USE; \ Uint simd_miss_##FILE = WHAT##MISS #define ASSERT_SIMD_AUX(FILE, WHAT) ASSERT_SIMD(FILE, WHAT)// expands WHAT #define THIS_FILE_ANYSIMD ASSERT_SIMD_AUX(this_file, SIMD_AVAILABILITY) #define SIMD_MISS(FILE, WHAT) \ Uint check_simd_##FILE() { return 1< #include "options.h" #include "xport_import.h" #include "win_linux_aux.h" #include "zzz_RandomFieldsUtils.h" #include "RandomFieldsUtils.h" #include "extern.h" KEY_type *PIDKEY[PIDMODULUS]; void KEY_type_NULL(KEY_type *KT) { // ACHTUNG!! setzt nur die uninteressanten zurueck. Hier also gar ncihts. KT->next = NULL; // braucht es eigentlich nicht KT->doshow = true; KT->ToIntDummy = NULL; KT->ToIntN = 0; KT->ToRealDummy = NULL; KT->ToRealN = 0; KT->nu2old = KT->nuOld = KT->nu1old = KT->nuAlt = -RF_INF; // option_type_NULL(KT, false) } void KEY_type_DELETE(KEY_type **S) { KEY_type *KT = *S; //option_type_DELETE(KT); FREE(KT->ToIntDummy); FREE(KT->ToRealDummy); UNCONDFREE(*S); } KEY_type *KEYT() { int mypid; pid(&mypid); // printf("entering KEYT %d %d \n", mypid, parentpid); // for (int i=0; ivisitingpid = mypid; if (PIDKEY[mypid % PIDMODULUS] != neu) { // another process had the // same idea FREE(neu); return KEYT(); // ... 
and try again } neu->pid = mypid; // printf("neu %d %d\n", mypid); neu->visitingpid = 0; neu->ok = true; if (PIDKEY[mypid % PIDMODULUS] != neu) BUG; KEY_type_NULL(neu); //if (basic.warn_parallel && mypid == parentpid) PRINTF("Do not forget to run 'RFoptions(storing=FALSE)' after each call of a parallel command (e.g. from packages 'parallel') that calls a function in 'RandomFields'. (OMP within RandomFields is not affected.) This message can be suppressed by 'RFoptions(warn_parallel=FALSE)'.") /*// OK */ return neu; } while (p->pid != mypid && p->next != NULL) { // printf("pp = %d\n", p->pid); p = p->next; } // printf("pp m = %d %d\n", p->pid, mypid); if (p->pid != mypid) { if (!p->ok || p->visitingpid != 0) { if (PL >= PL_ERRORS) { PRINTF("pid collision %d %d\n", p->ok, p->visitingpid); } // BUG; return KEYT(); } p->visitingpid = mypid; p->ok = false; if (p->visitingpid != mypid || p->ok) { return KEYT(); } KEY_type *neu = (KEY_type *) XCALLOC(1, sizeof(KEY_type)); neu->pid = mypid; if (!p->ok && p->visitingpid == mypid) { p->next = neu; p->visitingpid = 0; p->ok = true; return neu; } FREE(neu); p->visitingpid = 0; p->ok = true; KEY_type_NULL(neu); return KEYT(); } return p; } void setoptions(int i, int j, SEXP el, char name[LEN_OPTIONNAME], bool isList, bool local); void getoptions(SEXP sublist, int i, bool local); void deloptions(bool VARIABLE_IS_NOT_USED local) { #ifdef DO_PARALLEL if (local) RFERROR("'pivot_idx' cannot be freed on a local level"); #endif utilsoption_type *options = WhichOptionList(local); FREE(options->solve.pivot_idx); } THIS_FILE_ANYSIMD; EXTERN_SIMD_CHECK(avx2_fctns); EXTERN_SIMD_CHECK(avx_fctns); EXTERN_SIMD_CHECK(solve_61); void loadoptions() { if (!sseAvail) RFERROR("program does not run on machines this old (lacking SSE)\n"); CHECK_THIS_FILE_ANYSIMD; CHECK_FILE(avx_fctns); CHECK_FILE(avx2_fctns); CHECK_FILE(solve_61); MEMSET(PIDKEY, 0, PIDMODULUS * sizeof(KEY_type *)); pid(&parentpid); attachRFUoptions((char *) "RandomFieldsUtils", prefixlist, prefixN, allOptions, allOptionsN, setoptions, getoptions, NULL, // final deloptions, NULL, NULL, 0, true, GPU_NEEDS, // from configure.ac SIMD_INFO, RFU_VERSION, RFU_VERSION, MEMisALIGNED); KEY_type *KT = KEYT(); union { unsigned short a; unsigned char b[2]; } ab; ab.a = 0xFF00; KT->global_utils.basic.bigendian = ab.b[0] != 0; //finalizeoptions(); SetLaMode(); } utilsoption_type *WhichOptionList(bool local) { if (local) { KEY_type *KT = KEYT(); if (KT == NULL) BUG; return &(KT->global_utils); } return &OPTIONS; } void PIDKEY_DELETE() { for (int kn=0; kn<PIDMODULUS; kn++) { KEY_type *p = PIDKEY[kn]; while (p != NULL) { KEY_type *q = p; p = p->next; KEY_type_DELETE(&q); } PIDKEY[kn] = NULL; } } void detachoptions(){ PIDKEY_DELETE(); detachRFUoptions(prefixlist, prefixN); } RandomFieldsUtils/src/avx_fctns.cc0000644000176200001440000002640214227157055016747 0ustar liggesusers /* Authors Martin Schlather, schlather@math.uni-mannheim.de Collection of system specific auxiliary functions Copyright (C) 2001 -- 2021 Martin Schlather This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
g You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include "Basic_utils_local.h" #include #include #include "RandomFieldsUtils.h" #include "kleinkram.h" #include "options.h" #include "Utils.h" #include "xport_import.h" #include "extern.h" #if defined AVX ASSERT_SIMD(avx_fctns, avx); #define algn_general(X) ((1U + (uintptr_t) (((uintptr_t) X - 1U) / BytesPerBlock)) * BytesPerBlock) double static inline *algn(double *X) { assert(algn_general(X)>=(uintptr_t)X); return (double *) algn_general(X); } void colMaxsI256(double *M, Long r, Long c, double *ans) { if (r < 16 #if defined AVX || !avxAvail #elif defined SSE2 || !sse2Avail #endif ) { for (Long i=0; i= atonce) { __m256d SET_0(1), SET_0(2), SET_0(3), SET_0(4), SET_0(5), SET_0(6),SET_0(7); for (; i < lenM; i += atonce) { ADDF(0); ADDF(1); ADDF(2); ADDF(3); ADDF(4); ADDF(5); ADDF(6); ADDF(7); } SUMUP(0, 1); SUMUP(2, 3); SUMUP(4, 5); SUMUP(6, 7); SUMUP(0, 2); SUMUP(4, 6); SUMUP(0, 4); } lenM = len - vectorlen + 1; for (; i < lenM; i += vectorlen) { ADDF(0); } double sum = D[0] + D[1] + D[2] + D[3]; for (; i < len; i++) sum += x[i] * y[i]; return sum; } #endif */ double avx_scalarprodDnearfma(double * x, double * y, Long len) { // deutlich genauer zum 0 tarif Long i = 0, lenM = len - (atonce - 1); __m256d SET_0(0), SET_0(1), SET_0(2), SET_0(3), SET_0(4), SET_0(5), SET_0(6),SET_0(7), P_0(0), P_0(1), P_0(2), P_0(3), P_0(4), P_0(5), P_0(6), P_0(7); double *D = (double *) &sum0; if ( len >= atonce) { for (; i < lenM; i += atonce) { ADDN(0); ADDN(1); ADDN(2); ADDN(3); ADDN(4); ADDN(5); ADDN(6); ADDN(7); } SUMUP(0, 1); SUMUP(2, 3); SUMUP(4, 5); SUMUP(6, 7); SUMUP(0, 2); SUMUP(4, 6); SUMUP(0, 4); } lenM = len - vectorlen + 1; for (; i < lenM; i += vectorlen) { ADDN(0); } double sum = D[0] + D[1] + D[2] + D[3]; for (; i < len; i++) sum += x[i] * y[i]; return sum; } #define ADDM(NR) \ prod0 = MULTDOUBLE(LOADuDOUBLE(x + i + NR * vectorlen), \ LOADuDOUBLE(y + i + NR * vectorlen)); \ sum0 = ADDDOUBLE(sum0, prod0) double avx_scalarprodD(double * x, double * y, Long len){ Long i = 0, lenM = len - (atonce - 1); __m256d SET_0(0), P_0(0); double *D = (double *) &sum0; if ( len >= atonce) { for (; i < lenM; i += atonce) { ADDM(0); ADDM(1); ADDM(2); ADDM(3); ADDM(4); ADDM(5); ADDM(6); ADDM(7); } } lenM = len - vectorlen + 1; for (; i < lenM; i += vectorlen) { ADDM(0); } double sum = D[0] + D[1] + D[2] + D[3]; for (; i < len; i++) sum += x[i] * y[i]; return sum; } /* // #pragma clang optimize on|off. 
//double avx_scalarprodDopt(double * x, double * y, Long len) __attribute__ ((optimize(3))); // #pragma GCC push_options #pragma GCC optimize ("Os") // aggressive // aggressive double avx_scalarprodDopt(double * x, double * y, Long len) { Long i = 0, lenM = len - (atonce - 1); __m256d SET_0(0), P_0(0); double *D = (double *) &sum0; if ( len >= atonce) { for (; i < lenM; i += atonce) { ADDM(0); ADDM(1); ADDM(2); ADDM(3); ADDM(4); ADDM(5); ADDM(6); ADDM(7); } } lenM = len - vectorlen + 1; for (; i < lenM; i += vectorlen) { ADDM(0); } double sum = D[0] + D[1] + D[2] + D[3]; for (; i < len; i++) sum += x[i] * y[i]; return sum; } #pragma GCC pop_options #define ADDMM(NR) \ x0 = LOADuDOUBLE(X0 + i + NR * vectorlen); \ y0 = LOADuDOUBLE(Y0 + i + NR * vectorlen); \ prod0 = MULTDOUBLE(x0, y0); \ sum0 = ADDDOUBLE(sum0, prod0); \ x1 = LOADuDOUBLE(X1 + i + NR * vectorlen); \ prod0 = MULTDOUBLE(x1, y0); \ sum1 = ADDDOUBLE(sum1, prod0); \ y1 = LOADuDOUBLE(Y1 + i + NR * vectorlen); \ prod0 = MULTDOUBLE(x0, y1); \ sum2 = ADDDOUBLE(sum2, prod0); \ prod0 = MULTDOUBLE(x1, y1); \ sum3 = ADDDOUBLE(sum3, prod0); \ x2 = LOADuDOUBLE(X2 + i + NR * vectorlen); \ prod0 = MULTDOUBLE(x2, y0); \ sum4 = ADDDOUBLE(sum4, prod0); \ prod0 = MULTDOUBLE(x2, y1); \ sum5 = ADDDOUBLE(sum5, prod0); \ y0 = LOADuDOUBLE(Y2 + i + NR * vectorlen); \ prod0 = MULTDOUBLE(x0, y0); \ sum6 = ADDDOUBLE(sum6, prod0); \ prod0 = MULTDOUBLE(x1, y0); \ sum7 = ADDDOUBLE(sum7, prod0); \ prod0 = MULTDOUBLE(x2, y0); \ sum8 = ADDDOUBLE(sum8, prod0); \ #pragma GCC optimize ("O1") // aggressive void avx_scalarprodM(double * X0, double * Y0, Long len, double *res) { Long i = 0, lenM = len - (atonce - 1); __m256d SET_0(0), SET_0(1), SET_0(2), SET_0(3), SET_0(4), SET_0(5), SET_0(6), SET_0(7), SET_0(8), x0, x1, x2, y0, y1,// y2, P_0(0); double *X1 = X0 + len, *Y1 = Y0 + len, *X2 = X0 + 2 * len, *Y2 = Y0 + 2 * len; if ( len >= atonce) { for (; i < lenM; i += atonce) { ADDMM(0); ADDMM(1); ADDMM(2); ADDMM(3); ADDMM(4); ADDMM(5); ADDMM(6); ADDMM(7); } } lenM = len - vectorlen + 1; for (; i < lenM; i += vectorlen) { ADDMM(0); } double *D = (double *) &sum0, sum = D[0] + D[1] + D[2] + D[3]; for (; i < len; i++) sum += X0[i] * Y0[i]; res[0] = sum; } */ double avx_scalarprodDP(double * x, double * y, Long len) { Long i = 0, lenM = len - (atonce - 1); __m256d SET_0(0), SET_0(1), P_0(0); double *D = (double *) &sum1; if ( len >= atonce) { for (; i < lenM; ) { Long lenMM = i + vectorlen * (repet * 10L + 1L); if (lenMM > lenM) lenMM = lenM; sum0 = MULTDOUBLE(LOADuDOUBLE(x + i), LOADuDOUBLE(y + i)); i += vectorlen; for (; i < lenMM; i += atonce) { ADDM(0); ADDM(1); ADDM(2); ADDM(3); ADDM(4); ADDM(5); ADDM(6); ADDM(7); } sum1 = ADDDOUBLE(sum0, sum1); } } lenM = len - vectorlen + 1; for (; i < lenM; i += vectorlen) { prod0 = MULTDOUBLE(LOADuDOUBLE(x + i), LOADuDOUBLE(y + i)); sum1 = ADDDOUBLE(sum1, prod0); } double sum = D[0] + D[1] + D[2] + D[3]; for (; i < len; i++) sum += x[i] * y[i]; return sum; } #define ADDK(NR) \ prod0 = MULTDOUBLE(LOADuDOUBLE(x + i + NR * vectorlen), \ LOADuDOUBLE(y + i + NR * vectorlen)); \ sum2 = SUBDOUBLE(prod0, sum1); \ sum3 = ADDDOUBLE(sum0, sum2); \ sum1 = SUBDOUBLE(sum3, sum0); \ sum0 = sum3; \ sum1 = SUBDOUBLE(sum1, sum2); double avx_scalarprodDK(double * x, double * y, Long len) { // Kahan Long i = 0, lenM = len - (atonce - 1); __m256d SET_0(0), // sum SET_0(1), SET_0(2), // y SET_0(3), // t P_0(0), P_0(1); double *D = (double *) &sum0; if ( len >= atonce) { for (; i < lenM; i += atonce) { ADDK(0); ADDK(1); ADDK(2); ADDK(3); ADDK(4); 
ADDK(5); ADDK(6); ADDK(7); } } lenM = len - vectorlen + 1; for (; i < lenM; i += vectorlen) { ADDK(0); } sum0 = ADDDOUBLE(sum0, prod1); double sum = D[0] + D[1] + D[2] + D[3]; for (; i < len; i++) sum += x[i] * y[i]; return sum; } #else void colMaxsI(double *M, Long r, Long c, double *ans); void colMaxsI256(double *M, Long r, Long c, double *ans) {colMaxsI(M, r, c, ans);} void linearprod2by2(double * x, double y, Long len, double *inout); void avx_linearprodD(double * x, double y, Long len, double *inout) { linearprod2by2(x, y, len, inout);} double scalarprod4by4( double * v1, double * v2, Long N); double avx_scalarprodDnearfma(double * x, double * y, Long L) { return scalarprod4by4(x,y,L);} double avx_scalarprodD(double * x, double * y, Long L) { return scalarprod4by4(x,y,L);} double avx_scalarprodDP(double * x, double * y, Long L) { return scalarprod4by4(x,y,L);} double avx_scalarprodDK(double * x, double * y, Long L) { return scalarprod4by4(x,y,L);} SIMD_MISS(avx_fctns, avx); #endif RandomFieldsUtils/src/Basic_utils.h0000644000176200001440000003024414227157055017056 0ustar liggesusers/* Authors Martin Schlather, schlather@math.uni-mannheim.de Copyright (C) 2021 -- 2021 Martin Schlather This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ #ifndef basic_rfutils_h #define basic_rfutils_h 1 // RFU 3: vor Basic kein inttypes.h ( _INTTYPES_H ) oder aber RFdef_H==1; // miraculix 0: vor Basic inttypes.h; Ausnahme initNerror.cc kleinkram.cc thuy.cc xport_import.cc zzz.cc // adoption 0: vor Basic kein inttypes.h #define RFU_VERSION 12 #if defined RFU_LOCAL || defined RF_VERSION || defined MIRACULIX_VERSION || defined ADOPTION_VERSION || defined HAPLOBLOCKER_VERSION #if defined OBSOLETE_RFU #undef OBSOLETE_RFU #endif #else #if defined _INTTYPES_H #define obsolete_miraculix 1 #endif #ifndef OBSOLETE_RFU #define OBSOLETE_RFU 1 #endif #endif #define F77call F77_CALL // rename to control that USE_FC_LEN_T has been called #ifdef __cplusplus #define F77name extern "C" void F77_NAME // rename to control that USE_FC_LEN_T has been called #else #define F77name void F77_NAME #endif #define F77dgesdd F77call(dgesdd) #define F77dgemv F77call(dgemv) #define F77ddot F77call(ddot) #define F77dsyrk F77call(dsyrk) #ifndef __cplusplus #include #endif #include "def.h" #include #include #include #include #include "AutoRandomFieldsUtils.h" #define RFERROR error #define RFERROR1(M,A) {errorstring_type E_AUX; \ SPRINTF(E_AUX, M, A); RFERROR(E_AUX);} #define RFERROR2(M,A,B) {errorstring_type E_AUX; \ SPRINTF(E_AUX, M, A,B); RFERROR(E_AUX);} #define RFERROR3(M,A,B,C) {errorstring_type E_AUX;\ SPRINTF(E_AUX, M, A,B,C); RFERROR(E_AUX);} #define RFERROR4(M,A,B,C,D) {errorstring_type E_AUX; \ SPRINTF(E_AUX, M, A,B,C,D); RFERROR(E_AUX);} #define RFERROR5(M,A,B,C,D,E) {errorstring_type E_AUX; \ SPRINTF(E_AUX, M, A,B,C,D,E); RFERROR(E_AUX);} #define RFERROR6(M,A,B,C,D,E,F) {errorstring_type E_AUX;\ SPRINTF(E_AUX, M, A,B,C,D,E,F); RFERROR(E_AUX);} #define RFERROR7(M,A,B,C,D,E,F,G) {errorstring_type E_AUX;\ SPRINTF(E_AUX, M, A,B,C,D,E,F,G); RFERROR(E_AUX);} #define MULTIMINSIZE(S) ((S) > 20)// in omp parallel in DO_PARALLEL // #define MULTIMINSIZE(S) false // #define MULTIMINSIZE(S) true typedef char name_type[][MAXCHAR]; typedef enum usr_bool { // NOTE: if more options are included, change ExtendedBoolean in // userinterface.cc of RandomFields False=false, True=true, //Exception=2, // for internal use only Nan=INT_MIN } usr_bool; #define RF_NA NA_REAL #define RF_NAN R_NaN #define RF_NEGINF R_NegInf #define RF_INF R_PosInf #define T_PI M_2_PI #define OBSOLETENAME "obsolete" #define MAXINT 2147483647 #define MININT -2147483647 #define MAXUNSIGNED (MAXINT * 2) + 1 #define INFDIM MAXINT #define INFTY INFDIM #define PIDMODULUS 1000 #define LENGTH length // to avoid the unvoluntiered use of LENGTH defined by R #define complex Rcomplex #define DOT "." #define GAUSS_RANDOM(SIGMA) rnorm(0.0, SIGMA) #define UNIFORM_RANDOM unif_rand() #define POISSON_RANDOM(x) rpois(x) #define SQRT2 M_SQRT2 #define SQRTPI M_SQRT_PI #define INVPI M_1_PI #define PIHALF M_PI_2 #define ONETHIRD 0.333333333333333333333333 #define TWOTHIRD 0.6666666666666666666666667 #define TWOPI 6.283185307179586476925286766559 #define INVLOG2 1.442695040888963 #define INVSQRTTWO 0.70710678118654752440084436210 #define INVSQRTTWOPI 0.39894228040143270286 #define SQRTTWOPI 2.5066282746310002416 #define SQRTINVLOG005 0.5777613700268771079749 //#define LOG05 -0.69314718055994528623 #define LOG3 1.0986122886681096913952452369225257046474905578227 #define LOG2 M_LN2 #define EULER_C 0.5772156649015328606065120900824024310421 #define EPSILON 0.00000000001 #define EPSILON1000 0.000000001 #define MIN(A,B) ((A) < (B) ? (A) : (B)) #define MAX(A,B) ((A) > (B) ? 
(A) : (B)) #define ACOS std::acos #define ASIN std::asin #define ATAN std::atan #define FMIN fmin2 #define FMAX fmax2 #define ATANH std::atanh #define ACOSH std::acosh #define ASINH std::asinh #define EXPM1 std::expm1 #define LOG1P std::log1p #define FROUND fround #define COS std::cos #define EXP std::exp #define FABS(X) std::fabs((double) X) // OK; keine Klammern um X! #if ! defined MALLOCX #define MALLOCX std::malloc #define FLOOR std::floor #define SQRT(X) std::sqrt((double) X) // OK #define CEIL(X) std::ceil((double) X) // OK; keine Klammern um X! #define FREEX std::free #endif #define LOG std::log #define POW(X, Y) R_pow((double) X, (double) Y) // OK; keine Klammern um X! #define SIGN(X) sign((double) X) // OK #define SIN std::sin #define STRCMP(A, B) std::strcmp(A, B) // OK #define STRCPY(A, B) std::strcpy(A, B) // OK #define STRLEN std::strlen #define STRNCMP(A, B, C) std::strncmp(A, B, C) // OK #define STRNCPY(A, B, N) strcopyN(A, B, N) // OK #define TAN std::tan #define MEMCOPYX std::memcpy #define MEMMOVE std::memmove #define MEMSET std::memset #define MEMCMP std::memcmp #define AALLOC std::aligned_alloc #define CALLOCX std::calloc #define SPRINTF std::sprintf // Rprint #define ROUND(X) ownround((double) X) // OK #define TRUNC(X) ftrunc((double) X) // OK; keine Klammern um X! #define QSORT std::qsort #define print NEVER_USE_print_or_PRINTF_WITHIN_PARALLEL /* // */ #if defined SCHLATHERS_MACHINE && defined DO_PARALLEL && defined OMP_H #define PRINTF if (omp_get_num_threads() > 1) { error("\n\nnever use Rprintf/PRINTF within parallel constructions!!\n\n"); } else Rprintf // OK #else #define PRINTF Rprintf #endif #define R_PRINTLEVEL 1 #define C_PRINTLEVEL 1 #define MAXERRORSTRING 1000 typedef char errorstring_type[MAXERRORSTRING]; typedef unsigned int Uint; typedef uint64_t Ulong; typedef int64_t Long; // not SCHLATHERS_MACHINE #ifndef SCHLATHERS_MACHINE #define INTERNALMSG SERR0("Sorry. This functionality doesn't exist currently. There is work in progress at the moment by the maintainer.") #if ! defined assert #define assert(X) {} #endif #define BUG { \ RFERROR4("Severe error occured in function '%.50s' (file '%.50s', line %d).%.200s", \ __FUNCTION__, __FILE__, __LINE__, CONTACT); \ } //#define MEMCOPY(A,B,C) {MEMCPY(A,B,C); printf("memcpy %.50s %d\n", __FILE__, __LINE__);} #define MEMCOPY(A,B,C) MEMCOPYX(A,B,C) #define AMALLOC(ELEMENTS, SIZE) AALLOC(SIZE, (SIZE) * (ELEMENTS)) #if ! defined MALLOC #define MALLOC MALLOCX #define FREE(X) if ((X) == NULL) {} else {FREEX(X); (X)=NULL;} #endif #define CALLOC CALLOCX #define XCALLOC CALLOCX // #define UNCONDFREE(X) {FREEX(X); (X)=NULL;} #endif // not SCHLATHERS_MACHINE // SCHLATHERS_MACHINE #ifdef SCHLATHERS_MACHINE #define MAXALLOC 1000000000L // __extension__ unterdrueckt Fehlermeldung wegen geklammerter Argumente #define INTERNALMSG { \ RFERROR4("made to be an internal function '%.50s' ('%.50s', line %d).", \ __FUNCTION__, __FILE__, __LINE__); \ } #if ! 
defined assert #define assert(X) if (__extension__ (X)) {} else \ RFERROR4("'assert' failed in function '%.50s' (%.50s, line %d) %.200s.", \ __FUNCTION__, __FILE__, __LINE__, CONTACT) #endif #define SHOW_ADDRESSES 1 #define BUG { RFERROR3("BUG in '%.50s' of '%.50s' at line %d.\n", __FUNCTION__, __FILE__, __LINE__);} #define MEMCOPY(A,B,C) __extension__ ({ assert((A)!=NULL && (B)!=NULL && (C)>0 && (C)<=MAXALLOC); MEMCOPYX(A,B,C); }) //#define MEMCOPY(A,B,C) memory_copy(A, B, C) #define CALLOC(X, Y) __extension__({assert((X)>0 && (Y)>0 && ((X) * (Y))0 && (Y)>0 && ((X) * (Y))0 && (X)<=MAXALLOC); MALLOCX(X);}) #define FREE(X) if ((X) == NULL) {} else {if (!SHOWFREE) {} else PRINTF("free %.50s %ld Line %d %s\n", #X, (Long) X, __LINE__, __FILE__); FREEX(X); (X)=NULL;} #endif #define UNCONDFREE(X) { if (!SHOWFREE) {} else PRINTF("(free in %s, line %d)\n", __FILE__, __LINE__); FREEX(X); (X)=NULL;} #endif // SCHLATHERS_MACHINE #if defined SCHLATHER_DEBUGGING #undef MALLOC #undef CALLOC #undef XCALLOC #define MALLOC(X) __extension__({if (!DOPRINT) {} else PRINTF("(MLLC %s, line %d)\n", __FILE__, __LINE__);assert((X)>0 && (X)<=3e9); MALLOCX(X);}) #define CALLOC(X, Y) __extension__({if (!DOPRINT) {} else PRINTF("(CLLC %s, line %d)\n",__FILE__, __LINE__);assert((X)>0 && (Y)>0 && ((X) * (Y)) 0 && (Y)>0 && ((X) * (Y)) = 7 #define FALLTHROUGH_OK __attribute__ ((fallthrough)) #else #define FALLTHROUGH_OK #endif #define UTILSINFO(M) if (!KEYT()->global_utils.basic.helpinfo) {} else PRINTF("%s\n(Note that you can unable this information by 'RFoptions(helpinfo=FALSE)'.)\n", M) // OK #ifdef DO_PARALLEL #define HAS_PARALLEL true #else #define HAS_PARALLEL false #endif #ifdef USEGPU #define HAS_GPU true #else #define HAS_GPU false #endif #ifndef GPU_NEEDS // not a proper installation #define GPU_NEEDS Inone #endif #ifdef OBSOLETE_RFU #if defined SHOW_ADDRESSES #undef SHOW_ADDRESSES #endif #if ! defined RFU_NEED_OBSOLETE #undef FALLTHROUGH_OK #undef HAS_PARALLEL #endif extern int CORES; // from RF V4 on in extern.h: #define LENMSG MAXERRORSTRING #define LENERRMSG MAXERRORSTRING #define nErrorLoc MAXERRORSTRING typedef char errorloc_type[MAXERRORSTRING]; #define utilsparam utilsoption_type #define solve_param solve_options #if defined RFdef_H #define isGLOBAL NA_INTEGER #else #define isGLOBAL false #endif #ifdef _OPENMP #ifdef SCHLATHERS_MACHINE #define DO_PARALLEL 1 #else #define DO_PARALLEL 1 #endif #else #if defined DO_PARALLEL #undef DO_PARALLEL #endif #endif // #define LOCAL_MSG char MSG[LENERRMSG] #ifdef DO_PARALLEL #define LOCAL_ERRMSG2 char MSG2[LENERRMSG] #else // not DO_PARALLEL #define LOCAL_ERRMSG2 #endif //#if defined ERR //#undef ERR //#endif #ifndef ERR #define ERR ERR0 #endif #else // NOT OBSOLETE #if ! defined USE_FC_LEN_T #define USE_FC_LEN_T #endif #define ATAN2 std::atan2 #define COSH std::cosh #define SINH std::sinh #define TANH std::tanh #endif #if ! 
defined NA_LONG #define NA_LONG (-1L - (Long) 9223372036854775807) #endif #define FREE0(PT, WHICH) { \ FREE(PT->WHICH); PT->n_##WHICH= 0;} \ if (PT->WHICH != NULL) { \ UNCONDFREE(PT->WHICH); \ PT->n_##WHICH = 0; \ } else assert(PT->n_##WHICH==0); #endif RandomFieldsUtils/src/xport_import.h0000644000176200001440000000116614227157055017364 0ustar liggesusers #ifndef RandomFieldsUtilsxport_H #define RandomFieldsUtilsxport_H 1 typedef struct KEY_type KEY_type; struct KEY_type { KEY_type *next; utilsoption_type global_utils; int pid, visitingpid; bool ok, doshow; errorstring_type error_location; int *ToIntDummy; int ToIntN, ToRealN ; double *ToRealDummy; double loggamma1old, nu1old, loggamma2old, nu2old, loggamma_old,nuOld, gamma, nuAlt; }; extern KEY_type *PIDKEY[PIDMODULUS]; KEY_type *KEYT(); typedef struct option_type option_type; utilsoption_type *WhichOptionList(bool local); extern const char *R_TYPE_NAMES[LAST_R_TYPE_NAME + 1]; #endif RandomFieldsUtils/src/zzz_calls.h0000644000176200001440000002274114227157055016633 0ustar liggesusers/* Authors Martin Schlather, schlather@math.uni-mannheim.de Copyright (C) 2015 -- 2021 Martin Schlather This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #ifndef rfutils_calls_H #define rfutils_calls_H 1 /* in xport_import.cc of calling packages set #ifdefine ERROR_RFU_CALLS 1 #include "xport_import.h" ... // #define CALL(what) Ext_##what = (what##_type) R_GetCCallable(importfrom, #what) #define CALL(what) Ext_##what = #what_err; see RandomFields, for instance #in clude */ #ifdef ERROR_RFU_CALLS #define RFU_ERRCALL0(TYPE, FCTN) \ static TYPE FCTN##_err(){char msg[300]; SPRINTF(msg, "calling %.50s", #N); RFERROR(msg); } #define RFU_ERRCALL(TYPE, FCTN, ...) \ static TYPE FCTN##_err(__VA_ARGS__) { char msg[300]; SPRINTF(msg, "calling %.50s", #N); RFERROR(msg);} #else #define RFU_ERRCALL0(TYPE, FCTN) #define RFU_ERRCALL(TYPE, FCTN, ...) 
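/* Illustrative sketch (hypothetical code, compiled out): how the DECLARE* macros
   in this header are meant to be consumed. Each DECLAREn(TYPE, FCTN, ...) expands
   to a typedef 'FCTN_type', a hidden prototype 'RU_FCTN' and a plain prototype
   'FCTN' (plus an 'FCTN_err' stub when ERROR_RFU_CALLS is defined). A client
   package resolves the actual implementation at load time via R_GetCCallable(),
   typically through the CALL(what) macro quoted in the comment at the top of this
   file. The function name 'solvePosDef' and its signature below are placeholders
   only, not the registered interface. */
#if 0
#include <R_ext/Rdynload.h>                 /* declares R_GetCCallable() */
static const char *importfrom = "RandomFieldsUtils";
typedef int (*solvePosDef_type)(double *, int, double *, int);  /* assumed shape */
static solvePosDef_type Ext_solvePosDef = NULL;
#define CALL(what) Ext_##what = (what##_type) R_GetCCallable(importfrom, #what)
static void attach_RFU_calls(void) {
  CALL(solvePosDef);                        /* afterwards Ext_solvePosDef is usable */
}
#endif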
#endif #define DECLARE0(TYPE, FCTN) \ typedef TYPE (*FCTN##_type)(); \ attribute_hidden TYPE RU_##FCTN(); \ TYPE FCTN(); \ RFU_ERRCALL0(TYPE, FCTN) #define DECLARE1(TYPE, FCTN, A) \ typedef TYPE (*FCTN##_type)(A); \ attribute_hidden TYPE RU_##FCTN(A); \ TYPE FCTN(A); \ RFU_ERRCALL(TYPE, FCTN, A) #define DECLARE2(TYPE, FCTN, A, B) \ typedef TYPE (*FCTN##_type)(A, B); \ attribute_hidden TYPE RU_##FCTN(A, B); \ TYPE FCTN(A, B); \ RFU_ERRCALL(TYPE, FCTN, A, B) #define DECLARE3(TYPE, FCTN, A, B, C) \ typedef TYPE (*FCTN##_type)(A, B, C); \ attribute_hidden TYPE RU_##FCTN(A, B, C); \ TYPE FCTN(A, B, C);\ RFU_ERRCALL(TYPE, FCTN, A, B, C) #define DECLARE4(TYPE, FCTN, A, B, C, D) \ typedef TYPE (*FCTN##_type)(A, B, C, D); \ attribute_hidden TYPE RU_##FCTN(A, B, C, D); \ TYPE FCTN(A, B, C, D); \ RFU_ERRCALL(TYPE, FCTN, A, B, C, D) #define DECLARE5(TYPE, FCTN, A, B, C, D, E) \ typedef TYPE (*FCTN##_type)(A, B, C, D, E); \ attribute_hidden TYPE RU_##FCTN(A, B, C, D, E); \ TYPE FCTN(A, B, C, D, E); \ RFU_ERRCALL(TYPE, FCTN, A, B, C, D, E) #define DECLARE6(TYPE, FCTN, A, B, C, D, E, F) \ typedef TYPE (*FCTN##_type)(A, B, C, D, E, F); \ attribute_hidden TYPE RU_##FCTN(A, B, C, D, E, F); \ TYPE FCTN(A, B, C, D, E, F); \ RFU_ERRCALL(TYPE, FCTN, A, B, C, D, E, F) #define DECLARE7(TYPE, FCTN, A, B, C, D, E, F, G) \ typedef TYPE (*FCTN##_type)(A, B, C, D, E, F, G); \ attribute_hidden TYPE RU_##FCTN(A, B, C, D, E, F, G); \ TYPE FCTN(A, B, C, D, E, F, G); \ RFU_ERRCALL(TYPE, FCTN, A, B, C, D, E, F, G) #define DECLARE8(TYPE, FCTN, A, B, C, D, E, F, G, H) \ typedef TYPE (*FCTN##_type)(A, B, C, D, E, F, G, H); \ attribute_hidden TYPE RU_##FCTN(A, B, C, D, E, F, G, H); \ TYPE FCTN(A, B, C, D, E, F, G, H); \ RFU_ERRCALL(TYPE, FCTN, A, B, C, D, E, F, G, H) #define DECLARE9(TYPE, FCTN, A, B, C, D, E, F, G, H, I) \ typedef TYPE (*FCTN##_type)(A, B, C, D, E, F, G, H, I); \ attribute_hidden TYPE RU_##FCTN(A, B, C, D, E, F, G, H, I); \ TYPE FCTN(A, B, C, D, E, F, G, H, I); \ RFU_ERRCALL(TYPE, FCTN, A, B, C, D, E, F, G, H, I) #define DECLARE10(TYPE, FCTN, A, B, C, D, E, F, G, H, I, J) \ typedef TYPE (*FCTN##_type)(A, B, C, D, E, F, G, H, I, J); \ attribute_hidden TYPE RU_##FCTN(A, B, C, D, E, F, G, H, I, J); \ TYPE FCTN(A, B, C, D, E, F, G, H, I, J); \ RFU_ERRCALL(TYPE, FCTN, A, B, C, D, E, F, G, H, I, J) #define DECLARE11(TYPE, FCTN, A, B, C, D, E, F, G, H, I, J, K) \ typedef TYPE (*FCTN##_type)(A, B, C, D, E, F, G, H, I, J, K); \ attribute_hidden TYPE RU_##FCTN(A, B, C, D, E, F, G, H, I, J, K); \ TYPE FCTN(A, B, C, D, E, F, G, H, I, J, K); \ RFU_ERRCALL(TYPE, FCTN, A, B, C, D, E, F, G, H, I, J, K) #define DECLARE12(TYPE, FCTN, A, B, C, D, E, F, G, H, I, J, K, L) \ typedef TYPE (*FCTN##_type)(A, B, C, D, E, F, G, H, I, J, K, L); \ attribute_hidden TYPE RU_##FCTN(A, B, C, D, E, F, G, H, I, J, K, L); \ TYPE FCTN(A, B, C, D, E, F, G, H, I, J, K, L); \ RFU_ERRCALL(TYPE, FCTN, A, B, C, D, E, F, G, H, I, J, K, L) #define DECLARE13(TYPE, FCTN, A, B, C, D, E, F, G, H, I, J, K, L, M) \ typedef TYPE (*FCTN##_type)(A, B, C, D, E, F, G, H, I, J, K, L, M); \ attribute_hidden TYPE RU_##FCTN(A, B, C, D, E, F, G, H, I, J, K, L, M); \ TYPE FCTN(A, B, C, D, E, F, G, H, I, J, K, L, M); \ RFU_ERRCALL(TYPE, FCTN, A, B, C, D, E, F, G, H, I, J, K, L, M) #define DECLARE14(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N) \ typedef TYPE (*FCTN##_type)(A,B,C,D,E,F,G,H,I,J,K,L,M,N); \ attribute_hidden TYPE RU_##FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N); \ TYPE FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N); \ RFU_ERRCALL(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N) #define 
DECLARE15(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N, O) \ typedef TYPE (*FCTN##_type)(A,B,C,D,E,F,G,H,I,J,K,L,M,N, O); \ attribute_hidden TYPE RU_##FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N, O); \ TYPE FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N, O); \ RFU_ERRCALL(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N, O) #define DECLARE16(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P) \ typedef TYPE (*FCTN##_type)(A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P); \ attribute_hidden TYPE RU_##FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P); \ TYPE FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P); \ RFU_ERRCALL(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P) #define DECLARE17(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P, Q) \ typedef TYPE (*FCTN##_type)(A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P, Q); \ attribute_hidden TYPE RU_##FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P, Q); \ TYPE FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P, Q); \ RFU_ERRCALL(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P, Q) #define DECLARE18(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P, Q, R) \ typedef TYPE (*FCTN##_type)(A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P, Q, R); \ attribute_hidden TYPE RU_##FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P, Q, R); \ TYPE FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P, Q, R); \ RFU_ERRCALL(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P, Q, R) #define DECLARE19(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P, Q, R, S) \ typedef TYPE (*FCTN##_type)(A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P, Q, R, S); \ attribute_hidden TYPE RU_##FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P, Q, R, S); \ TYPE FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P, Q, R, S); \ RFU_ERRCALL(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P, Q, R, S) #define DECLARE20(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T) \ typedef TYPE (*FCTN##_type)(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T); \ attribute_hidden TYPE RU_##FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T); \ TYPE FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T); \ RFU_ERRCALL(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T) #define DECLARE21(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U) \ typedef TYPE (*FCTN##_type)(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U); \ attribute_hidden TYPE RU_##FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U); \ TYPE FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U); \ RFU_ERRCALL(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U) #define DECLARE22(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V) \ typedef TYPE (*FCTN##_type)(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V); \ attribute_hidden TYPE RU_##FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V); \ TYPE FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V); \ RFU_ERRCALL(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V) #define DECLARE23(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W) \ typedef TYPE (*FCTN##_type)(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V, W); \ attribute_hidden TYPE RU_##FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V, W); \ TYPE FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V, W); \ RFU_ERRCALL(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V, W) #define DECLARE24(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V, W, X) \ typedef TYPE (*FCTN##_type)(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V, W, X); \ attribute_hidden TYPE RU_##FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V, W, X); \ TYPE FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V, W, X); \ RFU_ERRCALL(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V, W, X) #define DECLARE25(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V, W, X, Y) \ typedef TYPE 
(*FCTN##_type)(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V, W, X, Y); \ attribute_hidden TYPE RU_##FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V, W, X, Y); \ TYPE FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V, W, X, Y); \ RFU_ERRCALL(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V, W, X, Y) #define DECLARE26(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V, W, X, Y, Z) \ typedef TYPE (*FCTN##_type)(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V, W, X, Y, Z); \ attribute_hidden TYPE RU_##FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V, W, X, Y, Z); \ TYPE FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V, W, X, Y, Z); \ RFU_ERRCALL(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V, W, X, Y, Z) #endif RandomFieldsUtils/src/AutoRandomFieldsUtils.h0000644000176200001440000000101714227157055021032 0ustar liggesusers #ifndef auto_rfutils_h #define auto_rfutils_h 1 #include "AutoRandomFieldsUtilsLocal.h" #define MAXUNITS 4 #define MAXCHAR 18 // max number of characters for (covariance) names #define RFOPTIONS "RFoptions" #define CLASS_TRYERROR "try-error" #define WARN_UNKNOWN_OPTION_ALL 4 #define WARN_UNKNOWN_OPTION_SINGLE 3 #define WARN_UNKNOWN_OPTION_CAPITAL 2 #define WARN_UNKNOWN_OPTION_NONE1 1 #define WARN_UNKNOWN_OPTION_NONE 0 #define CONTACT " Please contact the maintainer martin.schlather@math.uni-mannheim.de.\n" #endif RandomFieldsUtils/src/options.cc0000644000176200001440000004251014227157055016445 0ustar liggesusers/* Authors Martin Schlather, schlather@math.uni-mannheim.de Copyright (C) 2016 -- 2021 Martin Schlather This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include #include "parallel_simd.h" #ifdef TIME_AVAILABLE #include #endif #include "Basic_utils_local.h" // must be before anything else #include "kleinkram.h" #include "zzz_RandomFieldsUtils.h" #include "xport_import.h" #include "options.h" #include "extern.h" #define PLverbose 2 // IMPORTANT: all names of general must have at least 3 letters !!! 
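/* The name tables below drive RFoptions(): in setoptionsRFU() the index i selects
   the prefix as listed in prefixlist (0 = "basic", 1 = "installNrun", 2 = "solve")
   and j is the position of the option name within allOptions[i]; getoptionsRFU()
   fills one sublist per prefix in the same order. A minimal lookup over these
   tables could look like the following sketch (hypothetical helper, compiled out,
   not used by the package): */
#if 0
static bool findOption(const char *prefix, const char *name, int *i, int *j) {
  for (int p = 0; p < prefixN; p++) {              /* prefix: basic/installNrun/solve */
    if (STRCMP(prefix, prefixlist[p]) != 0) continue;
    for (int k = 0; k < allOptionsN[p]; k++) {     /* option name within that prefix */
      if (STRCMP(name, allOptions[p][k]) == 0) { *i = p; *j = k; return true; }
    }
  }
  return false;                                    /* unknown option */
}
#endif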
const char *basic[basicN] = { "printlevel","cPrintlevel", "seed", "cores", "skipchecks", "asList", "verbose", "helpinfo", "efficient", "bigendian","warn_parallel" }; const char *installNrun[installNrunN] = { "kahanCorrection", "warn_unknown_option", "la_mode", "install","installPackages", "determineLAmode", "mem_is_aligned", "gpuDevices", "maxStreams" }; const char * solve[solveN] = { "use_spam", "spam_tol", "spam_min_p", "svdtol", "eigen2zero", "solve_method", "spam_min_n", "spam_sample_n", "spam_factor", "spam_pivot", "max_chol", "max_svd", "pivot", "pivot_idx", // dynamic parameter "pivot_relerror", "pivot_max_deviation", "pivot_max_reldeviation", "det_as_log", "pivot_actual_size", "pivot_check", "pseudoinverse" //, "tmp_delete" }; const char * prefixlist[prefixN] = {"basic", "installNrun", "solve"}; const char **allOptions[prefixN] = {basic, installNrun, solve}; int allOptionsN[prefixN] = {basicN, installNrunN, solveN}; utilsoption_type OPTIONS = { // OK basic_START, installNrun_START, { solve_START } }; //#if defined(unix) || defined(__unix__) || defined(__unix) #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) int numCPU = sysconf(_SC_NPROCESSORS_ONLN); #else int numCPU = MAXINT; #endif int doPosDefIntern(double *M0, int size, bool posdef, double *rhs0, Long rhs_cols, double *result, double *logdet, int calculate, solve_storage *Pt, solve_options *sp, int VARIABLE_IS_NOT_USED cores); void setoptionsRFU(int i, int j, SEXP el, char name[LEN_OPTIONNAME], bool isList, utilsoption_type *options) { switch(i) { case 0: {// general basic_options *gp = &(options->basic); switch(j) { case 0: { // general options int threshold = 1000; //PL_ERRORS; gp->Rprintlevel = INT; if (gp->Rprintlevel > threshold) gp->Rprintlevel = threshold; PL = gp->Cprintlevel = gp->Rprintlevel + PLoffset; } break; case 1: PL = gp->Cprintlevel = INT + PLoffset ; break; case 2: gp->seed = Integer(el, name, 0, true); break; case 3: gp->cores = POSINT; if (gp->cores > numCPU) { WARN1("number of 'cores' is set to %d", numCPU); gp->cores = numCPU; } #ifdef DO_PARALLEL #else if (gp->cores != 1) { gp->cores = 1; PRINTF("The system does not allow for OpenMP: the value 1 is kept for 'cores'."); } #endif if (options->installNrun.determineLAmode) SetLaMode(); CORES = gp->cores; break; case 4: gp->skipchecks = LOGI; break; case 5: gp->asList = LOGI; break; case 6 : if (!isList) { PL = gp->Cprintlevel = gp->Rprintlevel = 1 + (LOGI * (PLverbose - 1)); } break; case 7: gp->helpinfo = LOGI; break; case 8 : gp->efficient = LOGI; break; case 9: break; // bigendian; read only case 10 : gp->warn_parallel = LOGI; break; default: BUG; }} break; case 1: { installNrun_options *gp = &(options->installNrun); switch(j) { case 0: gp->kahanCorrection = LOGI; break; case INSTALL_RUN_WARN_OPTION: gp->warn_unknown_option = INT; break; case 2: { int neu; if (TYPEOF(el) == STRSXP) { neu = GetName(el, name, LA_NAMES, LA_LAST + 1, gp->la_usr); //if (neu == LA_QUERY) { // if (!local) // OK // PRINTF("internal mode = '%.10s'\nmax size for internal Cholesky = %d\nmax size for tiny size algorithm = %d\n", LA_NAMES[gp->la_mode], gp->LaMaxTakeIntern, options->solve.tinysize); //} } else { neu = POS0INT; if (neu > LA_LAST) ERR0("wrong value for 'la_mode'"); } if (neu != LA_QUERY) { #ifdef USEGPU #else if (neu > LA_GPU) ERR1("In order to use '%.20s', install the package with apropriate compiler options.", LA_NAMES[LA_GPU]); #endif SetLaMode((la_modes) neu, options->basic.cores); // koennen noch fehler auftreten // printf("simu ende\n"); 
gp->la_usr = (la_modes) neu; } } break; case 3 : { install_modes old = gp->install; gp->install = (install_modes ) GetName(el, name, INSTALL_NAMES, INSTALL_LAST + 1, Iask); if (gp->install == Inone) gp->installPackages = false; else if (old != gp->install) { gp->installPackages = true; resetInstalled(); } } break; case 4 : // gp->installPackages != LOGI; break; case 5 : gp->determineLAmode = LOGI; break; case 6 : // gp->mem_is_aligned = LOGI; break; case 7 : Integer(el, name, gp->gpu_devices, MAX_GPU_DEVICES) ; gp-> Ngpu_devices = MIN(length(el), MAX_GPU_DEVICES); break; case 8 : gp->maxStreams = POS0INT; break; default: BUG; }} break; case 2: { // printf("name = %.50s %d\n", name, j); solve_options *so = &(options->solve); switch(j) { case 0: so->sparse = USRLOG; if (so->sparse != False) { so->sparse = False; ERR0("'spam' is currently disabled.") } break; // USRLOGRELAXED?? case 1: so->spam_tol = POS0NUM; break; case 2: Real(el, name, so->spam_min_p, 2); for (int u=0; u<=1; u++) so->spam_min_p[u] = so->spam_min_p[u] < 0 ? 0 : so->spam_min_p[u] > 1.0 ? 1.0 : so->spam_min_p[u]; break; case SOLVE_SVD_TOL: so->svd_tol = POS0NUM; break; case 4: so->eigen2zero = POS0NUM; break; case 5: GetName(el, name, InversionNames, nr_user_InversionMethods, (int) NoInversionMethod, (int) NoFurtherInversionMethod, (int *)so->Methods, SOLVE_METHODS); break; case 6: Integer(el, name, so->spam_min_n, 2); break; case 7: so->spam_sample_n = POSINT; break; case 8: so->spam_factor = POSINT; break; case 9: so->pivotsparse = POSINT; if (so->pivotsparse > 2) so->pivotsparse = PIVOT_NONE; break; case 10: // printf("max chol = %d\n", so->max_chol); so->max_chol = POSINT; // printf("X max chol = %d\n", so->max_chol); break; case 11: so->max_svd = POS0INT; break; // case 11: so->tmp_delete = LOGI; break; case 12: so->pivot_mode = (pivot_modes) GetName(el, name, PIVOT_NAMES, PIVOT_LAST + 1, so->pivot_mode); break; case 13: if (!isList) { int len = length(el); if (len == 0) { if (so->n_pivot_idx > 0) { FREE(so->pivot_idx); } } else { if (so->n_pivot_idx != len) { FREE(so->pivot_idx); so->pivot_idx = (int*) MALLOC(len * sizeof(int)); } for (int L=0; Lpivot_idx[L] = Integer(el, name, L); } so->n_pivot_idx = len; } break; case 14: so->pivot_relerror = POS0NUM; break; case 15: so->max_deviation = POSNUM; break; case 16: so->max_reldeviation = POS0NUM; break; case 17: so->det_as_log = LOGI; break; case 18: so->actual_size = POS0NUM; break; case 19: so->pivot_check = USRLOG; break; case 20: so->pseudoinverse = LOGI; break; default: BUG; }} break; default: BUG; } } void setoptions(int i, int j, SEXP el, char name[LEN_OPTIONNAME], bool isList, bool local) { if (!local && parallel()) ERR1("Option '%.25s' can be set only through 'RFoptions' at global level", allOptions[i][j]); setoptionsRFU(i, j, el, name, isList, WhichOptionList(local)); } void getoptionsRFU(SEXP sublist, int i, utilsoption_type *options) { int k = 0; //printf("OK %d\n", i); switch(i) { case 0 : { // printf("OK %d\n", i); basic_options *p = &(options->basic); ADD(ScalarInteger(p->Rprintlevel)); ADD(ScalarInteger(p->Cprintlevel - PLoffset)); ADD(ScalarInteger(p->seed)); ADD(ScalarInteger(p->cores)); ADD(ScalarLogical(p->skipchecks)); ADD(ScalarLogical(p->asList)); ADD(ScalarLogical(p->Rprintlevel >= PLverbose)); ADD(ScalarLogical(p->helpinfo)); ADD(ScalarLogical(p->efficient)); ADD(ScalarLogical(p->bigendian)); ADD(ScalarLogical(p->warn_parallel)); } break; case 1 : { installNrun_options *p = &(options->installNrun); ADD(ScalarLogical(p->kahanCorrection)); 
ADD(ScalarInteger(p->warn_unknown_option)); ADD(ScalarString(mkChar(LA_NAMES[p->la_usr]))); ADD(ScalarString(mkChar(INSTALL_NAMES[p->install]))); ADD(ScalarLogical(p->installPackages)); ADD(ScalarLogical(p->determineLAmode)); ADD(ScalarLogical(p->mem_is_aligned)); SET_VECTOR_ELT(sublist, k++, Int(p->gpu_devices, p->Ngpu_devices)); ADD(ScalarInteger(p->maxStreams)); } break; case 2 : { solve_options *p = &(options->solve); // printf("sparse user interface %d %d %d\n", p->sparse, NA_LOGICAL, NA_INTEGER); ADD(ExtendedBooleanUsr(p->sparse)); // ADD(ScalarReal(p->spam_tol)); SET_VECTOR_ELT(sublist, k++, Num(p->spam_min_p, 2)); ADD(ScalarReal(p->svd_tol)); ADD(ScalarReal(p->eigen2zero)); SET_VECTOR_ELT(sublist, k++, String((int*) p->Methods, InversionNames, SOLVE_METHODS, (int) NoFurtherInversionMethod)); // printf("A\n"); SET_VECTOR_ELT(sublist, k++, Int(p->spam_min_n, 2)); ADD(ScalarInteger(p->spam_sample_n)); ADD(ScalarInteger(p->spam_factor)); ADD(ScalarInteger(p->pivotsparse)); ADD(ScalarInteger(p->max_chol)); ADD(ScalarInteger(p->max_svd)); ADD(ScalarString(mkChar(PIVOT_NAMES[p->pivot_mode]))); //if (true) SET_VECTOR_ELT(sublist, k++, Int(p->pivot_idx, p->n_pivot_idx)); // else ADD(ScalarInteger(NA_INTEGER)); // ADD(ScalarLogical(p->tmp_delete)); ADD(ScalarReal(p->pivot_relerror)); ADD(ScalarReal(p->max_deviation)); ADD(ScalarReal(p->max_reldeviation)); ADD(ScalarLogical(p->det_as_log)); ADD(ScalarInteger(p->actual_size)); ADD(ExtendedBooleanUsr(p->pivot_check)); ADD(ScalarLogical(p->pseudoinverse)); } break; default : BUG; } // printf("EE A\n"); } void getoptions(SEXP sublist, int i, bool local) { getoptionsRFU(sublist, i, WhichOptionList(local)); } void params_utilsoption(int local, int *params) { utilsoption_type *from = &OPTIONS; if (local) { KEY_type *KT = KEYT(); from = &(KT->global_utils); } params[PIVOT_IDX_N] = from->solve.n_pivot_idx; } void get_utilsoption(utilsoption_type *S, int local) { assert(solveN == 21 && basicN == 9 && installNrunN == 10 && prefixN==3); utilsoption_type *from = &OPTIONS; if (local) { KEY_type *KT = KEYT(); from = &(KT->global_utils); } assert(from->solve.n_pivot_idx!=0 xor from->solve.pivot_idx == NULL); assert(S->solve.n_pivot_idx!=0 xor S->solve.pivot_idx == NULL); if (S->solve.n_pivot_idx != from->solve.n_pivot_idx) BUG; int *save_idx = S->solve.pivot_idx; MEMCOPY(S, from, sizeof(utilsoption_type)); // OK S->solve.pivot_idx = save_idx; if (S->solve.n_pivot_idx > 0) { MEMCOPY(S->solve.pivot_idx, from->solve.pivot_idx, sizeof(int) * S->solve.n_pivot_idx); } } void push_utilsoption(utilsoption_type *S, int local) { utilsoption_type *to = &OPTIONS; if (local) { KEY_type *KT = KEYT(); to = &(KT->global_utils); } assert(to->solve.n_pivot_idx!=0 xor to->solve.pivot_idx == NULL); assert(S->solve.n_pivot_idx!=0 xor S->solve.pivot_idx == NULL); int *save_idx = to->solve.pivot_idx; if (to->solve.n_pivot_idx != S->solve.n_pivot_idx) { FREE(to->solve.pivot_idx); to->solve.pivot_idx = (int*) MALLOC(S->solve.n_pivot_idx * sizeof(int)); save_idx = to->solve.pivot_idx; } MEMCOPY(to, S, sizeof(utilsoption_type)); // OK to->solve.pivot_idx = save_idx; if (S->solve.n_pivot_idx > 0) { MEMCOPY(to->solve.pivot_idx, S->solve.pivot_idx, sizeof(int) * S->solve.n_pivot_idx); } } void del_utilsoption(utilsoption_type *S) { FREE(S->solve.pivot_idx); S->solve.n_pivot_idx = 0; } extern bool obsolete_package_in_use; #define FASTER 1.3 // 1.3 as for typical application of likelihood, // the determinant calculation in RandomFieldsUtils is for free. 
Somehow a // balance int own_chol_up_to(int size, int maxtime, int VARIABLE_IS_NOT_USED cores) { #ifdef TIME_AVAILABLE if (size <= 0) return true; Long delta[2]; solve_options sp; solve_storage pt; solve_NULL(&pt); MEMCOPY(&sp, &(OPTIONS.solve), sizeof(solve_options)); sp.Methods[0] = Cholesky; sp.Methods[1] = NoFurtherInversionMethod; sp.pivot_mode = PIVOT_NONE; sp.sparse = False; double old_quotient = RF_NAN; // basic assumption is that R implementation's getting faster // for larger matrices // printf("**** start\n"); while (true) { // printf("x \n"); int sizeP1 = size + 1, sizesq = size * size, loops = size > 64 ? 1 : 16384 / ((size + 8) * (size+8)) / 4; double *M = (double*) MALLOC(sizesq * sizeof(double)); for (int j=0; j<=1; j++) { // printf("%d,%d\n", j, loops); SetLaMode(j == 0 || obsolete_package_in_use ? LA_INTERN : LA_R, cores); clock_t start = clock(); for (int k=0; k 1) M[1] = M[size] = 1e-5; //printf("size=%d\n", size); doPosDefIntern(M, size, true, NULL, 0, NULL, NULL, MATRIXSQRT, &pt, &sp, cores); //printf("doen\n"); } delta[j] = (Long) clock() - start; if (delta[j] < 0) delta[j] += MAXINT; // manual: 32bit repres. of clock } FREE(M); if (PL > 2) PRINTF("Cholesky decomposition for a %d x %d matrix needs %ld and %ld [mu s] on R and facile code on %d cores (#%d), respectively.\n", size, size, delta[1], delta[0], CORES, loops); // printf("delta %d %d %d\n", delta[0], delta[1], maxtime); if (delta[0] > maxtime || delta[1] > maxtime || delta[0] >= FASTER * delta[1]){ solve_DELETE0(&pt); if ((maxtime > 0 && (delta[0] > 10 * maxtime || delta[1] > 10 * maxtime)) || delta[0] > 2 * delta[1] || delta[1] > 2 * delta[0]) { // seems to be time consuming. So stop. return (double) delta[0] < FASTER * (double) delta[1] ? MAXINT : (size <= 0 ? 0 : size / 2); } break; } old_quotient = (double) delta[0] / delta[1]; size *= 2; } double new_quotient = (double) delta[0] / delta[1]; if (new_quotient < FASTER) return MAXINT; if (size <= 0) return(0); if (!R_FINITE(old_quotient)) { // printf("halfsize\n"); int compare = own_chol_up_to(size / 2, 0, cores); return compare == MAXINT ? size : compare; } double x0 = 0.5 * size * (1.0 + (FASTER - old_quotient) / (new_quotient - old_quotient)); //lin interpol assert(x0 >= 0.5 * size && x0 <= size); int compare = own_chol_up_to((int) x0, 0, cores); // printf("%f %f %f %f %d %d\n", x0,FASTER, old_quotient, new_quotient, size, compare); return (int) (compare == MAXINT ? x0 : 0.5 * size); #else ERR0("option 'LA_AUTO' is available only on linux systems"); return 0; #endif } int own_chol_up_to(int VARIABLE_IS_NOT_USED cores) { own_chol_up_to(256, 0, cores); //warm up for some BLAS implementatioan // CORES = GL OBAL.basic.cores = 4; // own_chol_up_to(256, 50000); // own_chol_up_to(8, 50000); return own_chol_up_to(256, 50000, cores); } void SetLaMode(la_modes usr_mode, int VARIABLE_IS_NOT_USED cores) { utilsoption_type *utils = &OPTIONS; la_modes la_mode = usr_mode; utils->solve.tinysize = utils->installNrun.LaMaxTakeIntern = -1; #define TINY_SIZE_MAX 3 if (la_mode == LA_INTERN) { utils->solve.tinysize = TINY_SIZE_MAX; utils->installNrun.LaMaxTakeIntern = MAXINT; } else if (la_mode == LA_AUTO) { la_mode = HAS_GPU ? 
LA_GPU : LA_R ; #if defined TIME_AVAILABLE # ifdef SCHLATHERS_MACHINE #else int PLalt = PL; PL = 0; # endif utils->installNrun.LaMaxTakeIntern = own_chol_up_to(cores); utils->solve.tinysize = MIN(TINY_SIZE_MAX, utils->installNrun.LaMaxTakeIntern); if (PL > 0) PRINTF("Limit size for facile Cholesky algorithm = %d\n", utils->installNrun.LaMaxTakeIntern); # ifdef SCHLATHERS_MACHINE #else PL = PLalt; # endif #endif } if ((la_mode == LA_GPU || la_mode == LA_R) && utils->solve.pivot_mode > PIVOT_AUTO) ERR0("Pivotized Cholesky decomposition has not been implemented yet for GPU and the LAPACK library"); utils->installNrun.la_mode = la_mode; } void SetLaMode() { utilsoption_type *utils = &OPTIONS; SetLaMode(utils->installNrun.la_usr, utils->basic.cores); } RandomFieldsUtils/src/beskf.cc0000644000176200001440000000400414227157055016040 0ustar liggesusers /* Authors Martin Schlather, schlather@math.uni-mannheim.de main library for unconditional simulation of random fields Copyright (C) 2015 -- 2021 Martin Schlather This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ /*gcc beskf.c -o [name] -lm -mavx*/ /*Modified Bessel Function*/ /*From R-3.6.1/src/nmath/b essel_k.c*/ //#include //#include //#include /*DBL_MIN*/ #include "Basic_utils_local.h" #include "errors_messages.h" #define xmax_BESS_K 705.342 /*From bessel.h*/ #define sqxmin_BESS_K 1.49e-154 /*From bessel.h*/ #define M_SQRT_2dPI 0.797884560802865355879892119869 /*From Rmath.h*/ #define min0(x, y) (((x) <= (y)) ? (x) : (y)) #define max0(x, y) (((x) <= (y)) ? (y) : (x)) /*void bes_k_simd (double *xv, double alpha, int sx, double *yv); int main (void) { int i, len = 1000000; double alpha = 1.1; double *x= m alloc(len *sizeof(double)), *yv = m alloc(len *sizeof(double)), timetaken; clock_t start, end; for (i = 0; i < len; i++){ x[i] = (double)rand()/RAND_MAX*5.0; yv[i] = 0.; } start = clock(); bes_k_simd(x,alpha,len,yv); end = clock(); timetaken = (double) (end - start); // printf("Time: %f\n",timetaken); f ree(x); f ree(yv); return 0; }*/ void bes_k_simd (double VARIABLE_IS_NOT_USED *xv, double VARIABLE_IS_NOT_USED Nu, int VARIABLE_IS_NOT_USED sx, double VARIABLE_IS_NOT_USED *yv) { ERR0("not yet programmed"); } RandomFieldsUtils/src/spamown.f0000644000176200001440000015533414227157055016307 0ustar liggesusersc c Authors: c Reinhard Furrer c c Copyright (C) 2017 -- 2017 Reinhard Furrer c c This program is free software; you can redistribute it and/or c modify it under the terms of the GNU General Public License c as published by the Free Software Foundation; either version 3 c of the License, or (at your option) any later version. c c This program is distributed in the hope that it will be useful, c but WITHOUT ANY WARRANTY; without even the implied warranty of c MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the c GNU General Public License for more details. 
c c You should have received a copy of the GNU General Public License c along with this program; if not, write to the Free Software c Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. subroutine amuxmat (n,m,p, x, y, a,ja,ia) implicit none integer n, m, p, ja(*), ia(*) double precision x(m,p), y(n,p), a(*) c----------------------------------------------------------------------- c Multiplies a sparse matrix by a full matrix using consecutive dot c products, cf. subroutine amux from sparse kit. c Matrix A is stored in compressed sparse row storage. c c on entry: c---------- c n = row dimension of A c p = column dimension of x c x = array of dimension mxp, m column dimension of A. c a, ja, c ia = input matrix in compressed sparse row format. c c on return: c----------- c y = array of dimension nxp, containing the product y=Ax c c Reinhard Furrer c----------------------------------------------------------------------- c local variables c double precision t integer j, i, k c----------------------------------------------------------------------- do j = 1,p do i = 1,n c c compute the inner product of row i with vector x c t = 0.0d0 do k=ia(i), ia(i+1)-1 t = t + a(k)*x(ja(k),j) enddo c y(i,j) = t enddo enddo c return c---------end-of-amuxmat------------------------------------------------ c----------------------------------------------------------------------- end c subroutine notzero (ja,ia,nrow,ncol,nnz,nz,jao,iao) c Return the structure of the zero entries in ra,ja,ia, in c compressed sparse row format via rao, jao, iao. c INPUT: c ja, ia -- sparse structure of the matrix A c nrow -- number of rows in `a' c ncol -- number of columns in `a' c nnz -- number of non-zero elements c nz -- number of zero elements c OUTPUT: c jao, iao -- sparse structure of the zero entries c WORK ARRAY: c colmn -- logical vector of length ncol implicit none integer nrow,ncol,nnz,nz,inz, & ja(nnz),ia(nrow+1),jao(nz),iao(nrow+1) logical colmn(ncol) integer i,j,k inz = 0 iao(1) = 1 do i = 1,nrow iao(i+1) = iao(i) do k = 1,ncol colmn(k) = .true. enddo do j = ia(i),ia(i+1)-1 colmn(ja(j)) = .false. enddo do k = 1,ncol if(colmn(k)) then inz = inz + 1 jao(inz) = k iao(i+1) = iao(i+1) + 1 endif enddo enddo return end subroutine setdiagmat (nrow, n, a, ja, ia, diag, iw) implicit none integer nrow, n double precision a(*), diag(n) integer ja(*), ia(nrow+1), iw(nrow) c----------------------------------------------------------------------- c Sets the diagonal entries of a sparse matrix c----------------------------------------------------------------------- c on entry: c --------- c nrow = integer. The row dimension of A c n = integer. Smallest dimension of A c c a, ja, ia = Matrix A in compressed sparse row format. Sorted. c diag = diagonal matrix stored as a vector diag(1:n) c iw = n vector of zeros. c c on return: c---------- c updated matrix A c iw = iw contains the positions of the diagonal entries in the c output matrix. (i.e., a(iw(k)), ja(iw(k)), k=1,...n, c are the values/column indices of the diagonal elements c of the output matrix. ). c c Reinhard Furrer c----------------------------------------------------------------- logical insert integer i,j, k, k1, k2, icount c c get positions of diagonal elements in data structure. c do 11 i=1,n do 21 j= ia(i),ia(i+1)-1 if (ja(j) .ge. i) then if (ja(j) .eq. i) then iw(i) = j endif goto 11 endif 21 continue 11 continue c c count number of holes in diagonal and add diag(*) elements to c valid diagonal entries. c icount = 0 do 31 i=1, n if (iw(i) .eq. 
0) then icount = icount+1 else a(iw(i)) = diag(i) endif 31 continue c c if no diagonal elements to insert return c if (icount .eq. 0) return c c shift the nonzero elements if needed, to allow for created c diagonal elements. c c c copy rows backward c do 5 i=nrow, 1, -1 c c go through row ii c k1 = ia(i) k2 = ia(i+1)-1 ia(i+1) = ia(i+1)+icount if ((i .gt. n) .or. (iw(i) .gt. 0)) then c iw(ii) equal to 0, means no diagonal element in a, we need to insert it c test is thus true. c no fill-in, only copying do 4 k = k2,k1,-1 ja(k+icount)=ja(k) a(k+icount)=a(k) 4 continue iw(i)=-i else insert=.TRUE. if (k2.lt.k1) then ja(k2+icount)=i a(k2+icount)=diag(i) iw(i)=k2+icount icount=icount-1 insert = .FALSE. if (icount .eq. 0) return else do 6 k = k2,k1,-1 if (ja(k).gt. i) then ja(k+icount)=ja(k) a(k+icount)=a(k) else if (insert) then ja(k+icount)=i a(k+icount)=diag(i) iw(i)=k+icount icount=icount-1 insert = .FALSE. if (icount .eq. 0) return endif if (ja(k).lt. i) then ja(k+icount)=ja(k) a(k+icount)=a(k) endif 6 continue c in case there is only one element, larger than i, we still need to c add the diagonal element if (insert) then ja(k+icount)=i a(k+icount)=diag(i) iw(i)=k+icount icount=icount-1 insert = .FALSE. if (icount .eq. 0) return endif endif endif 5 continue return c----------------------------------------------------------------------- c------------end-of-diagaddmat------------------------------------------ end subroutine diagaddmat (nrow, n, a, ja, ia, diag, iw) implicit none integer nrow, n double precision a(*), diag(n) integer ja(*), ia(nrow+1), iw(nrow) c----------------------------------------------------------------------- c Adds a diagonal matrix to a sparse matrix: A = Diag + A c----------------------------------------------------------------------- c on entry: c --------- c nrow = integer. The row dimension of A c n = integer. Smallest dimension of A c c a, ja, ia = Matrix A in compressed sparse row format. Sorted. c diag = diagonal matrix stored as a vector diag(1:n) c iw = n vector of zeros. c c on return: c---------- c updated matrix A c iw = iw contains the positions of the diagonal entries in the c output matrix. (i.e., a(iw(k)), ja(iw(k)), k=1,...n, c are the values/column indices of the diagonal elements c of the output matrix. ). c c Reinhard Furrer c----------------------------------------------------------------- logical insert integer i,j, k, k1, k2, icount c c get positions of diagonal elements in data structure. c do 11 i=1,n do 21 j= ia(i),ia(i+1)-1 if (ja(j) .ge. i) then if (ja(j) .eq. i) then iw(i) = j endif goto 11 endif 21 continue 11 continue c c count number of holes in diagonal and add diag(*) elements to c valid diagonal entries. c icount = 0 do 31 i=1, n if (iw(i) .eq. 0) then icount = icount+1 else a(iw(i)) = a(iw(i)) + diag(i) endif 31 continue c c if no diagonal elements to insert return c if (icount .eq. 0) return c c shift the nonzero elements if needed, to allow for created c diagonal elements. c c c copy rows backward c do 5 i=nrow, 1, -1 c c go through row ii c k1 = ia(i) k2 = ia(i+1)-1 ia(i+1) = ia(i+1)+icount if ((i .gt. n) .or. (iw(i) .gt. 0)) then c iw(ii) equal to 0, means no diagonal element in a, we need to insert it c test is thus true. c no fill-in, only copying do 4 k = k2,k1,-1 ja(k+icount)=ja(k) a(k+icount)=a(k) 4 continue iw(i)=-i else insert=.TRUE. if (k2.lt.k1) then ja(k2+icount)=i a(k2+icount)=diag(i) iw(i)=k2+icount icount=icount-1 insert = .FALSE. if (icount .eq. 0) return else do 6 k = k2,k1,-1 if (ja(k).gt. 
i) then ja(k+icount)=ja(k) a(k+icount)=a(k) else if (insert) then ja(k+icount)=i a(k+icount)=diag(i) iw(i)=k+icount icount=icount-1 insert = .FALSE. if (icount .eq. 0) return endif if (ja(k).lt. i) then ja(k+icount)=ja(k) a(k+icount)=a(k) endif 6 continue c in case there is only one element, larger than i, we still need to c add the diagonal element if (insert) then ja(k+icount)=i a(k+icount)=diag(i) iw(i)=k+icount icount=icount-1 insert = .FALSE. if (icount .eq. 0) return endif endif endif 5 continue return c----------------------------------------------------------------------- c------------end-of-setdiagmat------------------------------------------ end c----------------------------------------------------------------------- subroutine diagmua (nrow, a, ia, diag) implicit none integer nrow, ia(nrow+1) double precision a(*), diag(nrow), scal c----------------------------------------------------------------------- c performs the matrix by matrix product A = Diag * A (in place) c (diamua from sparsekit provides more functionality) c----------------------------------------------------------------------- c on entry: c --------- c nrow = integer. The row dimension of A c a, ia = Matrix A in compressed sparse row format. c (ja is not needed) c c diag = diagonal matrix stored as a vector diag(1:n) c c on return: c---------- c a, = resulting matrix A in compressed sparse row sparse format. c c Notes: c------- c Reinhard Furrer 2007-06-21 c c----------------------------------------------------------------- c local variables integer ii, k, k1, k2 do 1 ii=1,nrow c c normalize each row c k1 = ia(ii) k2 = ia(ii+1)-1 scal = diag(ii) do 2 k=k1, k2 a(k) = a(k)*scal 2 continue 1 continue c return c----------end-of-diagmua------------------------------------------------ c----------------------------------------------------------------------- end c----------------------------------------------------------------------- c----------------------------------------------------------------------- subroutine getdiag (a,ja,ia,len,diag) implicit none double precision diag(*),a(*) integer len, ia(*), ja(*) c----------------------------------------------------------------------- c This subroutine extracts the main diagonal. c (getdia from sparsekit provides more functionality) c----------------------------------------------------------------------- c c on entry: c---------- c c len= min(nrow, ncol) = min dimension of the matrix a. c a,ja,ia = matrix stored in sorted compressed sparse row a,ja,ia,format c diag = array of zeros. c c on return: c----------- c diag = array of length containing the wanted diagonal. c c Notes: c------- c Reinhard Furrer 2006-11-02 c----------------------------------------------------------------------c c local variables integer i, k c c extract diagonal elements c do 1 i=1, len do k= ia(i),ia(i+1) -1 if (ja(k) .ge. i) then c we are at or beyond the diagonal. if (ja(k) .eq. i) then diag(i)= a(k) endif goto 1 endif enddo 1 continue return c------------end-of-getdiag---------------------------------------------- c----------------------------------------------------------------------- end c Functions that are new or modified. subroutine subsparsefull(nrow,a,ja,ia,b) c c subtracts a sparse matrix from a full one c algorithm is in-place, i.e. 
b is changed c c c Notes: c------- c Reinhard Furrer 2006-09-21 c----------------------------------------------------------------------- implicit none integer nrow,ja(*),ia(nrow+1) double precision a(*), b(nrow,*) integer i,k do i=1,nrow do k=ia(i),ia(i+1)-1 b(i,ja(k)) = b(i,ja(k))-a(k) enddo enddo return end subroutine subfullsparse(nrow,ncol,a,ja,ia,b) c c subtracts a full matrix from a sparse one c algorithm is in-place, i.e. b is changed c c c Notes: c------- c Reinhard Furrer 2006-09-21 c----------------------------------------------------------------------- implicit none integer nrow,ncol,ja(*),ia(nrow+1) double precision a(*), b(nrow,*) integer i,j,k do i=1,nrow do j=1,ncol b(i,j) = -b(i,j) enddo do k=ia(i),ia(i+1)-1 b(i,ja(k)) = b(i,ja(k))+a(k) enddo enddo return end subroutine addsparsefull(nrow,a,ja,ia,b) c c adds a sparse matrix to a full one c algorithm is in-place, i.e. b is changed c c c Notes: c------- c Reinhard Furrer 2006-09-21 c----------------------------------------------------------------------- implicit none integer nrow,ja(*),ia(nrow+1) double precision a(*), b(nrow,*) integer i,k do i=1,nrow do k=ia(i),ia(i+1)-1 b(i,ja(k)) = b(i,ja(k))+a(k) enddo enddo return end subroutine constructia(nrow,nir,ia,ir) c c constructs from a regular row index vector a sparse ia vector. c note that a regular column index vector corresponds to the c sparse ja vector. for example: c A[ir,jc] => A@ja = jc, A@ia = constructia(nrow,nir,ia,ir)$ia c c nrow: row dimension of A c nir: length of ir c ir: array of length nir+1!!! c c Notes: c------- c _*Row indices have to be ordered!*_ c c Reinhard Furrer 2006-09-13 c----------------------------------------------------------------------- implicit none integer nrow,nir integer ia(nrow+1),ir(*) integer i,k k=1 ia(1)=1 do i=1,nrow 5 continue if (ir(k) .eq. i) then k=k+1 if (k .le. nir) goto 5 endif ia(i+1)=k enddo ia(nrow+1)=nir+1 return end subroutine disttospam(nrow,x,a,ja,ia,eps) implicit none integer nrow, ia(nrow+1), ja(*) double precision x(*), a(*), eps c c Convertion of an R dist object (removes zero entries as well). c c On entry: c---------- c nrow -- row dimension of the matrix c x -- elements of the dist object (is lower diagonal) c n*(i-1) - i*(i-1)/2 + j-i for i < j c c a,ja,ia -- input matrix in CSR format c c On return: c----------- c a,ja,ia -- cleaned matrix c c Notes: c------- c Reinhard Furrer 2008-08-13 c----------------------------------------------------------------------- c c Local integer i,j,k, tmp ia(1) = 1 k = 1 do i = 2, nrow ia(i) = k do j=1 , i-1 tmp = nrow*(j-1)-j*(j-1)/2+i-j if (.not.(dabs(x(tmp)) .le. eps)) then ja(k) = j a(k) = x(tmp) k = k + 1 endif enddo enddo ia(nrow+1) = k return c---- end of disttospam ------------------------------------------------- c----------------------------------------------------------------------- end subroutine setdiaold (nrow,ncol,a,ja,ia,c,jc,ic,cmax,diag,eps) implicit none double precision a(*),c(*),diag(*),eps integer nrow, ncol, ia(*), ja(*), ic(*), jc(*), cmax c c this routine sets the diagonal entries of a matix, provided they c are non-zero. c c On entry: c---------- c nrow,ncol -- dimensions of the matrix c a,ja,ia -- input matrix in CSR format c c,jc,ic -- input matrix in CSR format with enough space, see below c diag -- diagonal values to set c eps -- what is smaller than zero? 
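c              (i.e., values diag(i) .le. eps count as zero and are
c              not inserted where the matrix stores no diagonal
c              element; see the test on diag(i) below)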
c c On return: c----------- c c,jc,ic -- matrix with modified diag in CSR format c c Notes: c------- c Reinhard Furrer 2006-10-30 c----------------------------------------------------------------------- c c Local double precision b(nrow) integer i,k, len, ib(nrow+1), jb(nrow) c len=0 ib(1)=1 do i=1,nrow jb(i)=0 enddo do 10 i=1,nrow do 15 k= ia(i),ia(i+1) -1 if (ja(k) .eq. i) then a(k)=diag(i) c(k)=diag(i) ib(i+1)=ib(i) goto 10 endif if (ja(k) .gt. i) then if (diag(i).gt.eps) then len=len+1 jb(len)=i ib(i+1)=ib(i)+1 b(len)=diag(i) else ib(i+1)=ib(i) endif goto 10 endif 15 continue 10 continue if (len .eq. 0) return c c set nonexisiting elements. c call subass(nrow,ncol,a,ja,ia,b,jb,ib,c,jc,ic,cmax) return c------------end of setdia---------------------------------------------- c----------------------------------------------------------------------- end c----------------------------------------------------------------------- c c----------------------------------------------------------------------- subroutine subass(nrow,ncol,a,ja,ia,b,jb,ib,c,jc,ic,nzmax) implicit none integer nrow,ncol,nzmax integer ja(*),jb(*),jc(*),ia(*),ib(*),ic(*) double precision a(*), b(*), c(*) c----------------------------------------------------------------------- c replaces the elements of A with those of B for matrices in sorted CSR c format. we assume that each row is sorted with increasing column c indices. c----------------------------------------------------------------------- c on entry: c --------- c nrow = integer. The row dimension of A and B c ncol = integer. The column dimension of A and B. c c a,ja,ia, c b,jb,ib = Matrices A and B in compressed sparse row format with column c entries sorted ascendly in each row c c nzmax = integer. The max length of the arrays c and jc. c c on return: c---------- c c,jc,ic = resulting matrix C in compressed sparse row sparse format c with entries sorted ascendly in each row. c c Notes: c------- c Reinhard Furrer 2006-09-13, based on sparsekit2 subroutine aplb1 c----------------------------------------------------------------------- c local variables integer i,j1,j2,ka,kb,kc,kamax,kbmax kc = 1 ic(1) = kc c c looping over the rows: do 6 i=1, nrow ka = ia(i) kb = ib(i) kamax = ia(i+1)-1 kbmax = ib(i+1)-1 5 continue c If we have one or more entries then ka <= kamax c If we do not have any entries in both A and B c we will not enter the if clause. In which case c we repeatedly copy ic(i+1) <- ic(i). if (ka .le. kamax .or. kb .le. kbmax) then c j1 and j2 are left hand pointers of the first entry c of A and B. If no entry, they are set to ncol+1 if (ka .le. kamax) then j1 = ja(ka) else j1 = ncol+1 endif if (kb .le. kbmax) then j2 = jb(kb) else j2 = ncol+1 endif c c Three cases: c j1=j2: copy element of b in c, incr. all three pointers c j1j2: copy element of b in c, incr. b and c pointers if (j1 .eq. j2) then c(kc) = b(kb) jc(kc) = j1 ka = ka+1 kb = kb+1 kc = kc+1 else if (j1 .lt. j2) then jc(kc) = j1 c(kc) = a(ka) ka = ka+1 kc = kc+1 else if (j1 .gt. j2) then jc(kc) = j2 c(kc) = b(kb) kb = kb+1 kc = kc+1 endif C the next four lines should not be required... if (kc .gt. 
nzmax+1) then c write (*,*) "exceeding array capacities...",i,nzmax, c & ka,kb,kc,j1,j2,kamax,kbmax,ncol,jb(kb) return endif goto 5 endif ic(i+1) = kc 6 continue return c------------end-of-subass---------------------------------------------- c----------------------------------------------------------------------- end subroutine spamcsrdns(nrow,a,ja,ia,dns) implicit none integer i,k integer nrow,ja(*),ia(*) double precision dns(nrow,*),a(*) c----------------------------------------------------------------------- c Compressed Sparse Row to Dense c----------------------------------------------------------------------- c c converts a row-stored sparse matrix into a densely stored one c c On entry: c---------- c c nrow = row-dimension of a c a, c ja, c ia = input matrix in compressed sparse row format. c (a=value array, ja=column array, ia=pointer array) c dns = array where to store dense matrix c c on return: c----------- c dns = the sparse matrix a, ja, ia has been stored in dns(nrow,*) c c changes: c--------- c eliminated the ierr c eliminated the filling of zeros: all done with c----------------------------------------------------------------------- do i=1,nrow do k=ia(i),ia(i+1)-1 dns(i,ja(k)) = a(k) enddo enddo return c---- end of csrdns ---------------------------------------------------- c----------------------------------------------------------------------- end c----------------------------------------------------------------------- subroutine spamdnscsr(nrow,ncol,dns,ndns,a,ja,ia,eps) implicit none integer i,j,next integer nrow,ncol,ndns,ia(*),ja(*) double precision dns(ndns,*),a(*),eps c----------------------------------------------------------------------- c Converts a densely stored matrix into a CSR sparse matrix. c----------------------------------------------------------------------- c on entry: c--------- c c nrow = row-dimension of a c ncol = column dimension of a c nzmax = maximum number of nonzero elements allowed. This c should be set to be the lengths of the arrays a and ja. c dns = input nrow x ncol (dense) matrix. c ndns = first dimension of dns. c c on return: c---------- c c a, ja, ia = value, column, pointer arrays for output matrix c c changes: c--------- c eliminated the ierr c introduced epsilon c----------------------------------------------------------------------- next = 1 ia(1) = 1 do i=1,nrow do j=1, ncol if (.not.(dabs(dns(i,j)) .le. eps)) then ja(next) = j c write(*,*) next,dns(i,j), eps, i, j a(next) = dns(i,j) next = next+1 endif enddo ia(i+1) = next enddo return c---- end of dnscsr ---------------------------------------------------- c----------------------------------------------------------------------- end c----------------------------------------------------------------------- subroutine getmask(nrow,nnz,ir,jc,jao,iao) c----------------------------------------------------------------------- implicit none integer nrow,nnz,ir(*),jc(*),jao(*),iao(*) integer k,k0,j,i,iad c----------------------------------------------------------------------- c Gets Compressed Sparse Row indices from Coordinate ones c----------------------------------------------------------------------- c Loosely based on coocsr from Sparsekit. c c on entry: c--------- c nrow = dimension of the matrix c nnz = number of nonzero elements in matrix c ir, c jc = matrix in coordinate format. ir(k), jc(k) store the nnz c nonzero index. The order of the elements is arbitrary. 
c iao = vector of 0 of size nrow+1 c c on return: c----------- c ir is destroyed c c jao, iao = matrix index in general sparse matrix format with c jao containing the column indices, c and iao being the pointer to the beginning of the row c c------------------------------------------------------------------------ c determine row-lengths. do 2 k=1, nnz iao(ir(k)) = iao(ir(k))+1 2 continue c starting position of each row.. k = 1 do 3 j=1,nrow+1 k0 = iao(j) iao(j) = k k = k+k0 3 continue c go through the structure once more. Fill in output matrix. do 4 k=1, nnz i = ir(k) j = jc(k) iad = iao(i) jao(iad) = j iao(i) = iad+1 4 continue c shift back iao do 5 j=nrow,1,-1 iao(j+1) = iao(j) 5 continue iao(1) = 1 return c----------------------------------------------------------------------- end c----------------------------------------------------------------------- subroutine getblock(a,ja,ia, nrw, rw, ncl, cl, bnz, b,jb,ib) c----------------------------------------------------------------------- c purpose: c -------- c this function returns the elements a(rw,cl) of a matrix a, c for any index vector rw and cl. the matrix is assumed to be stored c in compressed sparse row (csr) format. c c c Reinhard Furrer 2006-09-12 c----------------------------------------------------------------------- c parameters: c ----------- c on entry: c---------- c a,ja,ia = the matrix a in compressed sparse row format (input). c nrw,rw c ncl,cl = length of and the vector containing the rows and columns c to extract c c on return: c----------- c bnz = nonzero elements of b c b,jb,ib = the matrix a(rw,cl) in compressed sparse row format. c c note: c------ c no error testing is done. It is assumed that b has enough space c allocated. c----------------------------------------------------------------------- implicit none integer nrw,rw(*), ncl, cl(*) integer bnz, ia(*),ja(*), ib(*),jb(*) double precision a(*),b(*) c c local variables. c integer irw, jcl, jja c c write(*,*) cl(1),cl(2) bnz = 1 ib(1) = 1 do irw = 1,nrw do jcl = 1,ncl do jja = ia(rw(irw)),ia(rw(irw)+1)-1 if (cl(jcl) .eq. ja(jja)) then c we've found one... b(bnz) = a(jja) jb(bnz) = jcl bnz = bnz + 1 endif enddo enddo ib(irw+1) = bnz c end irw, we've cycled over all lines enddo bnz = bnz - 1 c write(*,*) cl(1),cl(2) return c--------end-of-getblock------------------------------------------------ c----------------------------------------------------------------------- end c----------------------------------------------------------------------- subroutine getlines(a,ja,ia, nrw, rw, bnz, b,jb,ib) c----------------------------------------------------------------------- c purpose: c -------- c this function returns the lines rw of a matrix a. c the matrix is assumed to be stored c in compressed sparse row (csr) format. c c c Reinhard Furrer 2012-04-04 c----------------------------------------------------------------------- c parameters: c ----------- c on entry: c---------- c a,ja,ia = the matrix a in compressed sparse row format (input). c nrw,rw = length of and the vector containing the rows and columns c to extract c c on return: c----------- c bnz = nonzero elements of b c b,jb,ib = the matrix a(rw,cl) in compressed sparse row format. c c note: c------ c no error testing is done. It is assumed that b has enough space c allocated. c----------------------------------------------------------------------- implicit none integer nrw,rw(*) integer bnz, ia(*),ja(*), ib(*),jb(*) double precision a(*),b(*) c c local variables. 
c integer irw, jja c bnz = 1 ib(1) = 1 do irw = 1,nrw do jja = ia(rw(irw)),ia(rw(irw)+1)-1 b(bnz) = a(jja) jb(bnz) = ja(jja) bnz = bnz + 1 enddo ib(irw+1) = bnz c end irw, we've cycled over all lines enddo bnz = bnz - 1 return c--------end-of-getlines------------------------------------------------ c----------------------------------------------------------------------- end c----------------------------------------------------------------------- subroutine getelem(i,j,a,ja,ia,iadd,elem) c----------------------------------------------------------------------- c purpose: c -------- c this function returns the element a(i,j) of a matrix a, c for any pair (i,j). the matrix is assumed to be stored c in compressed sparse row (csr) format. getelem performs a c binary search. c also returns (in iadd) the address of the element a(i,j) in c arrays a and ja when the search is successsful (zero if not). c----------------------------------------------------------------------- c parameters: c ----------- c on entry: c---------- c i = the row index of the element sought (input). c j = the column index of the element sought (input). c a = the matrix a in compressed sparse row format (input). c ja = the array of column indices (input). c ia = the array of pointers to the rows' data (input). c on return: c----------- c elem = value of a(i,j). c iadd = address of element a(i,j) in arrays a, ja if found, c zero if not found. (output) c c note: the inputs i and j are not checked for validity. c----------------------------------------------------------------------- c noel m. nachtigal october 28, 1990 -- youcef saad jan 20, 1991. c c Reinhard Furrer: converted to subroutine and eliminated sorted c many manipulations... last for 0.31; Sept 13 c----------------------------------------------------------------------- implicit none integer i, ia(*), iadd, j, ja(*) double precision a(*),elem c c local variables. c integer ibeg, iend, imid c c initialization c iadd = 0 ibeg = ia(i) iend = ia(i+1)-1 c empty line! test at beginning 10 if (iend .lt. ibeg) return c c begin binary search: c test of bounds if (ja(ibeg).gt.j) return if (ja(iend).lt.j) return if (ja(ibeg).eq.j) then iadd = ibeg goto 20 endif if (ja(iend).eq.j) then iadd = iend goto 20 endif c compute the middle index and test if found imid = ( ibeg + iend ) / 2 if (ja(imid).eq.j) then iadd = imid goto 20 endif c update the interval bounds. if (ja(imid).gt.j) then iend = imid -1 else ibeg = imid +1 endif goto 10 c c set iadd and elem before returning 20 elem = a(iadd) return c--------end-of-getelem------------------------------------------------- c----------------------------------------------------------------------- end subroutine getallelem(nir,ir,jr,a,ja,ia,alliadd,allelem) c----------------------------------------------------------------------- c purpose: c -------- c wrapper to getelem to retrieve several elements. 
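c
c for example, ir = (2,1), jr = (3,3) fills allelem with the
c matrix entries A(2,3) and A(1,3); alliadd(i) returns the
c position of entry i within a/ja, or 0 if it is not stored
c (in which case allelem(i) is left unchanged by getelem).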
c----------------------------------------------------------------------- c Reinhard Furrer 2006-09-12 c----------------------------------------------------------------------- implicit none integer nir,ir(nir),jr(nir),ja(*),ia(*),alliadd(nir) double precision a(*),allelem(nir) c local vars integer i do i = 1,nir call getelem(ir(i),jr(i),a,ja,ia,alliadd(i),allelem(i)) enddo return c--------end-of-allgetelem---------------------------------------------- c----------------------------------------------------------------------- end c----------------------------------------------------------------------- c----------------------------------------------------------------------- c- c- Modified by P. T. Ng from sparsekit c----------------------------------------------------------------------- c----------------------------------------------------------------------- subroutine aemub (nrow,ncol,a,ja,ia,amask,jmask,imask, * c,jc,ic,iw,aw,nzmax,ierr) c--------------------------------------------------------------------- real(8) a(*),c(*),amask(*),aw(ncol) integer ia(nrow+1),ja(*),jc(*),ic(nrow+1),jmask(*),imask(nrow+1) logical iw(ncol) c----------------------------------------------------------------------- c Modified from amask by Pin T. Ng on 2/27/03 to perform c element-wise multiplication c----------------------------------------------------------------------- c On entry: c--------- c nrow = integer. row dimension of input matrix c ncol = integer. Column dimension of input matrix. c c a, c ja, c ia = the A matrix in Compressed Sparse Row format c c amask, c jmask, c imask = matrix defining mask stored in compressed c sparse row format. (This is the B matrix) c c nzmax = length of arrays c and jc. see ierr. c c On return: c----------- c c a, ja, ia and amask, jmask, imask are unchanged. c c c c jc, c ic = the output matrix in Compressed Sparse Row format. c c ierr = integer. serving as error message.c c ierr = 1 means normal return c ierr .gt. 1 means that amask stopped when processing c row number ierr, because there was not enough space in c c, jc according to the value of nzmax. c c work arrays: c------------- c iw = logical work array of length ncol. c aw = real work array of length ncol. c c note: c------ the algorithm is in place: c, jc, ic can be the same as c a, ja, ia in which cas the code will overwrite the matrix c c on a, ja, ia c c----------------------------------------------------------------------- ierr = 0 len = 0 do 1 j=1, ncol iw(j) = .false. aw(j) = 0.0 1 continue c unpack the mask for row ii in iw do 100 ii=1, nrow c save pointer and value in order to be able to do things in place do 2 k=imask(ii), imask(ii+1)-1 iw(jmask(k)) = .true. aw(jmask(k)) = amask(k) 2 continue c add umasked elemnts of row ii k1 = ia(ii) k2 = ia(ii+1)-1 ic(ii) = len+1 do 200 k=k1,k2 j = ja(k) if (iw(j)) then len = len+1 if (len .gt. nzmax) then ierr = ii return endif jc(len) = j c(len) = a(k)*aw(j) endif 200 continue c do 3 k=imask(ii), imask(ii+1)-1 iw(jmask(k)) = .false. 
aw(jmask(k)) = 0.0 3 continue 100 continue ic(nrow+1)=len+1 c return c-----end-of-aemub ----------------------------------------------------- c----------------------------------------------------------------------- end c----------------------------------------------------------------------- subroutine aemub1 (nrow,ncol,a,ja,ia,b,jb,ib,c,jc,ic, * nzmax,ierr) real(8) a(*), b(*), c(*) integer ja(*),jb(*),jc(*),ia(nrow+1),ib(nrow+1),ic(nrow+1) c----------------------------------------------------------------------- c A modification of aplsb by Pin Ng on 6/12/02 to c perform the element-wise operation C = A*B for matrices in c sorted CSR format. c the difference with aplsb is that the resulting matrix is such that c the elements of each row are sorted with increasing column indices in c each row, provided the original matrices are sorted in the same way. c----------------------------------------------------------------------- c on entry: c --------- c nrow = integer. The row dimension of A and B c ncol = integer. The column dimension of A and B. c c a, c ja, c ia = Matrix A in compressed sparse row format with entries sorted c c b, c jb, c ib = Matrix B in compressed sparse row format with entries sorted c ascendly in each row c c nzmax = integer. The length of the arrays c and jc. c amub will stop if the result matrix C has a number c of elements that exceeds exceeds nzmax. See ierr. c c on return: c---------- c c, c jc, c ic = resulting matrix C in compressed sparse row sparse format c with entries sorted ascendly in each row. c c ierr = integer. serving as error message. c ierr = 0 means normal return, c ierr .gt. 0 means that amub stopped while computing the c i-th row of C with i=ierr, because the number c of elements in C exceeds nzmax. c c Notes: c------- c this will not work if any of the two input matrices is not sorted c----------------------------------------------------------------------- ierr = 0 kc = 1 ic(1) = kc c c the following loop does a merge of two sparse rows and c multiplies them. c do 6 i=1, nrow ka = ia(i) kb = ib(i) kamax = ia(i+1)-1 kbmax = ib(i+1)-1 5 continue c c this is a while -- do loop -- c if (ka .le. kamax .or. kb .le. kbmax) then c if (ka .le. kamax) then j1 = ja(ka) else c take j1 large enough that always j2 .lt. j1 j1 = ncol+1 endif if (kb .le. kbmax) then j2 = jb(kb) else c similarly take j2 large enough that always j1 .lt. j2 j2 = ncol+1 endif c c three cases c if (j1 .eq. j2) then c(kc) = a(ka)*b(kb) jc(kc) = j1 ka = ka+1 kb = kb+1 kc = kc+1 else if (j1 .lt. j2) then ka = ka+1 else if (j1 .gt. j2) then kb = kb+1 endif if (kc .gt. nzmax) goto 999 goto 5 c c end while loop c endif ic(i+1) = kc 6 continue return 999 ierr = i return c------------end-of-aemub1 --------------------------------------------- c----------------------------------------------------------------------- end c----------------------------------------------------------------------- subroutine aedib (nrow,ncol,job,a,ja,ia,b,jb,ib, * c,jc,ic,nzmax,iw,aw,ierr) real(8) a(*), b(*), c(*), aw(ncol) integer ja(*),jb(*),jc(*),ia(nrow+1),ib(nrow+1),ic(nrow+1), * iw(ncol) c----------------------------------------------------------------------- c performs the element-wise matrix division C = A/B. c Modified from aplsb by Pin Ng on 2/27/03 c----------------------------------------------------------------------- c on entry: c --------- c nrow = integer. The row dimension of A and B c ncol = integer. The column dimension of A and B. c job = integer. Job indicator. 
When job = 0, only the structure c (i.e. the arrays jc, ic) is computed and the c real values are ignored. c c a, c ja, c ia = Matrix A in compressed sparse row format. c c b, c jb, c ib = Matrix B in compressed sparse row format. c c nzmax = integer. The length of the arrays c and jc. c amub will stop if the result matrix C has a number c of elements that exceeds exceeds nzmax. See ierr. c c on return: c---------- c c, c jc, c ic = resulting matrix C in compressed sparse row sparse format. c c ierr = integer. serving as error message. c ierr = 0 means normal return, c ierr .gt. 0 means that amub stopped while computing the c i-th row of C with i=ierr, because the number c of elements in C exceeds nzmax. c c work arrays: c------------ c iw = integer work array of length equal to the number of c columns in A. c aw = real work array of length equal to the number of c columns in A. c c----------------------------------------------------------------------- logical values values = (job .ne. 0) ierr = 0 len = 0 ic(1) = 1 do 1 j=1, ncol iw(j) = 0 1 continue c do 500 ii=1, nrow c row i do 200 ka=ia(ii), ia(ii+1)-1 len = len+1 jcol = ja(ka) if (len .gt. nzmax) goto 999 jc(len) = jcol if (values) c(len) = a(ka)/0.0 iw(jcol)= len aw(jcol) = a(ka) 200 continue c do 300 kb=ib(ii),ib(ii+1)-1 jcol = jb(kb) jpos = iw(jcol) if (jpos .eq. 0) then len = len+1 if (len .gt. nzmax) goto 999 jc(len) = jcol if (values) c(len) = 0.0 iw(jcol)= len else if (values) c(jpos) = aw(jcol)/b(kb) endif 300 continue do 301 k=ic(ii), len iw(jc(k)) = 0 301 continue ic(ii+1) = len+1 500 continue return 999 ierr = ii return c------------end of aedib ----------------------------------------------- c----------------------------------------------------------------------- end c----------------------------------------------------------------------- subroutine aeexpb (nrow,ncol,job,a,ja,ia,b,jb,ib, * c,jc,ic,nzmax,iw,aw,ierr) real(8) a(*), b(*), c(*), aw(ncol) integer ja(*),jb(*),jc(*),ia(nrow+1),ib(nrow+1),ic(nrow+1), * iw(ncol) c----------------------------------------------------------------------- c performs the element-wise matrix division C = A/B. c Modified from aplsb by Pin Ng on 2/27/03 c----------------------------------------------------------------------- c on entry: c --------- c nrow = integer. The row dimension of A and B c ncol = integer. The column dimension of A and B. c job = integer. Job indicator. When job = 0, only the structure c (i.e. the arrays jc, ic) is computed and the c real values are ignored. c c a, c ja, c ia = Matrix A in compressed sparse row format. c c b, c jb, c ib = Matrix B in compressed sparse row format. c c nzmax = integer. The length of the arrays c and jc. c amub will stop if the result matrix C has a number c of elements that exceeds exceeds nzmax. See ierr. c c on return: c---------- c c, c jc, c ic = resulting matrix C in compressed sparse row sparse format. c c ierr = integer. serving as error message. c ierr = 0 means normal return, c ierr .gt. 0 means that amub stopped while computing the c i-th row of C with i=ierr, because the number c of elements in C exceeds nzmax. c c work arrays: c------------ c iw = integer work array of length equal to the number of c columns in A. c aw = real work array of length equal to the number of c columns in A. c c----------------------------------------------------------------------- logical values values = (job .ne. 
0) ierr = 0 len = 0 ic(1) = 1 do 1 j=1, ncol iw(j) = 0 1 continue c do 500 ii=1, nrow c row i do 200 ka=ia(ii), ia(ii+1)-1 len = len+1 jcol = ja(ka) if (len .gt. nzmax) goto 999 jc(len) = jcol if (values) c(len) = 1.0 iw(jcol)= len aw(jcol) = a(ka) 200 continue c do 300 kb=ib(ii),ib(ii+1)-1 jcol = jb(kb) jpos = iw(jcol) if (jpos .eq. 0) then len = len+1 if (len .gt. nzmax) goto 999 jc(len) = jcol if (values) c(len) = 0.0**b(kb) iw(jcol)= len else if (values) c(jpos) = aw(jcol)**b(kb) endif 300 continue do 301 k=ic(ii), len iw(jc(k)) = 0 301 continue ic(ii+1) = len+1 500 continue return 999 ierr = ii return c------------end of aeexpb ----------------------------------------------- c----------------------------------------------------------------------- end SUBROUTINE CALCJA(nrow,nsuper, % xsuper,lindx,xlindx,xlnz, % cholcja) c small function to calculate ja for the cholesky factor c as they use a condensed format. GRATULIERU LIT! c INPUT: c nrow (integer) number of rows c nsuper (integer) number of supernodes c xsuper (integer) supernode partition c xlindx,lindx (integer) row indices for each supernode c xlnz (integer) ia for cholesky factor c c OUTPUT: c cholcja (integer) ja for cholesky factor IMPLICIT NONE INTEGER nrow,nsuper INTEGER xsuper(nrow),lindx(*),xlindx(nrow+1),xlnz(nrow+1) INTEGER cholcja(*) INTEGER k, i, j, m, n k=1 m=1 DO i=1,nsuper DO j=1,( xsuper(i+1)-xsuper(i)) DO n=1,(xlnz(k+1)-xlnz(k)) cholcja(m)=lindx( xlindx(i)+j-2 + n) m=m+1 ENDDO k=k+1 ENDDO ENDDO RETURN END subroutine transpose(n,m,a,ja,ia,ao,jao,iao) implicit none integer n,m,ia(n+1),iao(m+1),ja(*),jao(*) double precision a(*),ao(*) integer i,j,k,next c----------------------------------------------------------------------- c Transposition c similar to csrcsc from sparsekit c----------------------------------------------------------------------- c on entry: c---------- c n = number of rows of CSR matrix. c m = number of columns of CSC matrix. c a = real array of length nnz (nnz=number of nonzero elements in input c matrix) containing the nonzero elements. c ja = integer array of length nnz containing the column positions c of the corresponding elements in a. c ia = integer of size n+1. ia(k) contains the position in a, ja of c the beginning of the k-th row. c c on return: c ---------- c ao = real array of size nzz containing the "a" part of the transpose c jao = integer array of size nnz containing the column indices. c iao = integer array of size n+1 containing the "ia" index array of c the transpose. 
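c
c note: iao must contain zeros on entry, since the row lengths
c of transp(A) are first accumulated into it.
c
c small worked example (values chosen purely for illustration):
c the 2 x 3 matrix
c        | 1  0  2 |
c        | 0  3  0 |
c is stored as  a = (1,2,3), ja = (1,3,2), ia = (1,3,4);
c the routine returns   ao = (1,3,2), jao = (1,2,1),
c iao = (1,2,3,4).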
c c----------------------------------------------------------------------- c----------------- compute lengths of rows of transp(A) ---------------- do i=1, n do k=ia(i), ia(i+1)-1 j = ja(k)+1 iao(j) = iao(j)+1 enddo enddo c---------- compute pointers from lengths ------------------------------ iao(1) = 1 do i=1,m iao(i+1) = iao(i) + iao(i+1) enddo c--------------- now do the actual copying ----------------------------- do i=1,n do k=ia(i),ia(i+1)-1 j = ja(k) next = iao(j) ao(next) = a(k) jao(next) = i iao(j) = next+1 enddo enddo c-------------------------- reshift iao and leave ---------------------- do i=m,1,-1 iao(i+1) = iao(i) enddo iao(1) = 1 c----------------------------------------------------------------------- end c----------------------------------------------------------------------- c----------------------------------------------------------------------- subroutine reducedim(a,ja,ia,eps,bnrow,bncol,k,b,jb,ib) implicit none double precision a(*),b(*),eps integer bnrow, bncol,k integer ia(*),ja(*),ib(*),jb(*) integer i, j, jaj c----------------------------------------------------------------------- c Reduces the dimension of A to (,bnrow,bncol) by copying it to B. c (Hence not in place - for R purposes). c Only elements smaller than eps are copied. c----------------------------------------------------------------------- c on entry: c--------- c c------------------------------------------------------------------------ k=1 do i=1,bnrow ib(i)=k do j=ia(i), ia(i+1)-1 jaj=ja(j) if (jaj .le.bncol) then if (abs( a(j)) .gt. eps) then b(k)=a(j) jb(k)=jaj k=k+1 endif endif enddo enddo ib(bnrow+1)=k return c----------------------------------------------------------------------- end c----------------------------------------------------------------------- c Currently not used... subroutine reducediminplace(eps,nrow,ncol,k,a,ja,ia) implicit none double precision a(*),eps integer nrow, ncol,k integer ia(*),ja(*) integer i, j, jj, itmp c----------------------------------------------------------------------- c Reduces the dimension of A to (nrow,ncol) _in place_ c Only elements smaller than eps are copied. c----------------------------------------------------------------------- c Reinhard Furrer, June 2008 c------------------------------------------------------------------------ k=1 do i=1,nrow itmp = ia(i) ia(i)=k do j=itmp, ia(i+1)-1 jj=ja(j) if (jj .le. ncol) then if (abs( a(jj)) .gt. eps) then a(k)=a(jj) ja(k)=jj k=k+1 endif endif enddo enddo ia(nrow+1)=k return c----------------------------------------------------------------------- end c----------------------------------------------------------------------- c----------------------------------------------------------------------c c T R I A N G U L A R S Y S T E M S O L U T I O N S c c c c spamforward and spamback c c----------------------------------------------------------------------c subroutine spamforward (n,p,x,b,l,jl,il) implicit none integer n, p, jl(*),il(n+1) double precision x(n,p), b(n,p), l(*) integer i, k, j double precision t k = 0 c----------------------------------------------------------------------- c solves L x = y ; L = lower triang. / CSR format c sequential forward elimination c----------------------------------------------------------------------- c c On entry: c---------- c n,p = integer. dimensions of problem. c b = real array containg the right side. c c l, jl, il, = Lower triangular matrix stored in CSR format. c c On return: c----------- c x = The solution of L x = b. 
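c
c written out, the loop below is plain forward substitution,
c applied to each column i of b:
c     x(1,i) = b(1,i) / L(1,1)
c     x(k,i) = ( b(k,i) - sum_{j<k} L(k,j)*x(j,i) ) / L(k,k), k=2..n
c a zero diagonal element aborts the sweep (label 5) and a
c nonpositive value is returned in n as error flag.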
c-------------------------------------------------------------------- c Reinhard Furrer June 2008, April 2012 c if first diagonal element is zero, break if (l(1) .eq. 0.0 ) goto 5 c cycle over all columns of b do i=1,p c first row has one element then cycle over all rows x(1,i) = b(1,i) / l(1) do 3 k = 2, n t = b(k,i) do 1 j = il(k), il(k+1)-1 if (jl(j) .lt. k) then t = t-l(j)*x(jl(j),i) else if (jl(j) .eq. k) then if (l(j) .eq. 0.0) goto 5 c diagonal element is not zero, hence we divide and leave the loop x(k,i) = t / l(j) goto 3 endif endif 1 continue 3 continue enddo return 5 n = -k return end c----------------------------------------------------------------------- subroutine spamback (n,p,x,b,r,jr,ir) implicit none integer n, p, jr(*),ir(n+1) double precision x(n,p), b(n,p), r(*) integer l, k, j double precision t c----------------------------------------------------------------------- c Solves R x = b R = upper triangular. c----------------------------------------------------------------------- c c On entry: c---------- c n,p = integers. dimension of problem. c b = real array containg the right side. c c r, jr, ir, = Upper triangular matrix stored in CSR format. c c On return: c----------- c x = The solution of R x = b . c-------------------------------------------------------------------- c Reinhard Furrer June 2008, April 2012 k = 0 if (r(ir(n+1)-1) .eq. 0.0 ) goto 5 do l=1,p x(n,l) = b(n,l) / r(ir(n+1)-1) do 3 k = n-1,1,-1 t = b(k,l) do 1 j = ir(k+1)-1,ir(k),-1 if (jr(j) .gt. k) then t = t - r(j)*x(jr(j),l) else if (jr(j) .eq. k) then if (r(j) .eq. 0.0) goto 5 c diagonal element is not zero, hence we divide and leave the loop x(k,l) = t / r(j) goto 3 endif endif 1 continue 3 continue enddo return 5 n = -k return end c----------------------------------------------------------------------- RandomFieldsUtils/src/cholmodified.f0000644000176200001440000064606014227157055017252 0ustar liggesusersc c Authors: c Reinhard Furrer c c Copyright (C) 2017 -- 2017 Reinhard Furrer c c This program is free software; you can redistribute it and/or c modify it under the terms of the GNU General Public License c as published by the Free Software Foundation; either version 3 c of the License, or (at your option) any later version. c c This program is distributed in the hope that it will be useful, c but WITHOUT ANY WARRANTY; without even the implied warranty of c MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the c GNU General Public License for more details. c c You should have received a copy of the GNU General Public License c along with this program; if not, write to the Free Software c Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
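c
c updatefactor performs a purely numerical re-factorization: it
c reuses the symbolic factorization of an earlier call (the
c permutation invp/perm, the supernode partition xsuper/snode and
c the index structure lindx/xlindx/xlnz) and assumes the new
c matrix (d,jd,id) has the same sparsity pattern.  The steps are:
c zero the factor storage (cleanlnz), scatter the new numerical
c values into it (inpnv), set up cache splitting and workspace
c (bfinit), and refactorize numerically (blkfc2); negative error
c codes from blkfc2 are remapped to positive values in ierr.
c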
subroutine updatefactor( m,nnzd, & d,jd,id, invp,perm, & lindx,xlindx, nsuper,lnz,xlnz, & snode, xsuper, & cachesize,ierr) implicit none integer m,nnzd integer nsuper,tmpsiz, & ierr, & jd(nnzd),cachesize, & id(m+1),lindx(*),xlindx(*), & invp(m),perm(m),xlnz(m+1), & snode(m),xsuper(m+1) double precision d(nnzd),lnz(*) c temp and working stuff, loops, etc integer iwork(7*m+3) integer split(m) c c Clean L call cleanlnz(nsuper,xsuper,xlnz,lnz) c c Input numerical values into data structures of L call inpnv(id,jd,d,perm,invp,nsuper,xsuper,xlindx,lindx, & xlnz,lnz,iwork) c c Initialization for block factorization call bfinit(m,nsuper,xsuper,snode,xlindx,lindx,cachesize,tmpsiz, & split) c c Numerical factorization call blkfc2(nsuper,xsuper,snode,split,xlindx,lindx,xlnz, & lnz,iwork(1),iwork(nsuper+1),iwork(2*nsuper+1), & iwork(2*nsuper+m+1),tmpsiz,ierr) if (ierr .eq. -1) then ierr = 1 go to 100 elseif (ierr .eq. -2) then ierr = 3 go to 100 endif 100 continue return end subroutine cholstepwise(m,nnzd, & d,jd,id, doperm,invp,perm, & nsub,nsubmax, & lindx,xlindx,nsuper,nnzlmax,lnz,xlnz, & snode,xsuper, & cachsz,ierr) c Modified chol routine c c c Sparse least squares solver via Ng-Peyton's sparse Cholesky c factorization for sparse symmetric positive definite c INPUT: c m -- the number of column in the matrix A c d -- an nnzd-vector of non-zero values of A c jd -- an nnzd-vector of indices in d c id -- an (m+1)-vector of pointers to the begining of each c row in d and jd c nsubmax -- upper bound of the dimension of lindx c lindx -- an nsub-vector of integer which contains, in c column major oder, the row subscripts of the nonzero c entries in L in a compressed storage format c xlindx -- an nsuper-vector of integer of pointers for lindx c nsuper -- the length of xlindx ??? 
c nnzlmax -- the upper bound of the non-zero entries in c L stored in lnz, including the diagonal entries c lnz -- First contains the non-zero entries of d; later c contains the entries of the Cholesky factor c xlnz -- column pointer for L stored in lnz c invp -- an n-vector of integer of inverse permutation c vector c perm -- an n-vector of integer of permutation vector c colcnt -- array of length m, containing the number of c non-zeros in each column of the factor, including c the diagonal entries c snode -- array of length m for recording supernode c membership c xsuper -- array of length m+1 containing the supernode c partitioning c split -- an m-vector with splitting of supernodes so that c they fit into cache c tmpmax -- upper bound of the dimension of tmpvec c tmpvec -- a tmpmax-vector of temporary vector c cachsz -- size of the cache (in kilobytes) on the target c machine c ierr -- error flag c 1 -- insufficient work space in call to extract c 2 -- insufficient storage in iwork when calling ordmmd; c 3 -- insufficient storage in iwork when calling sfinit; c 4 -- nnzl > nnzlmax when calling sfinit c 5 -- nsub > nsubmax when calling sfinit c 6 -- insufficient work space in iwork when calling symfct c 7 -- inconsistancy in input when calling symfct c 8 -- tmpsiz > tmpmax when calling symfct; increase tmpmax c 9 -- nonpositive diagonal encountered when calling c blkfct c 10 -- insufficient work storage in tmpvec when calling c blkfct c 11 -- insufficient work storage in iwork when calling c blkfct c OUTPUT: c y -- an m-vector of least squares solution c nsub -- number of subscripts in lindx c WORK ARRAYS: c adjncy -- the indices of non diag elements c iwsiz -- set at 7*m+3 c iwork -- an iwsiz-vector of integer as work space c c implicit none integer m,nnzd,doperm integer nsub,nsuper,nnzl,iwsiz,tmpsiz, & nnzlmax,nsubmax,cachsz,ierr, & adj(m+1),adjncy(nnzd-m+1),jd(nnzd), c fix introduced in 29-3 c & adj(m+1),adjncy(nnzd-m),jd(nnzd), & id(m+1),lindx(nsubmax),xlindx(m+1), & invp(m),perm(m),xlnz(m+1), & colcnt(m),snode(m),xsuper(m+1),split(m) double precision d(nnzd),lnz(nnzlmax) c temp and working stuff, loops, etc integer i,j,k, nnzadj, jtmp integer iwork(7*m+3) c iwsiz is used temporalily iwsiz=0 c Create the adjacency matrix: eliminate the diagonal elements from c (d,id,jd) and make two copies: (*,xlindx,lindx),(*,adj,adjncy) c Also to lindx and xlindx, because the matrix structure is destroyed c by the minimum degree ordering routine. nsub = 0 c the adj matrix has m elements less than d nnzadj = nnzd - m k=1 do i=1,m c copy id, but ajust for the missing diagonal. xlindx(i) = id(i)-i+1 adj(i) = xlindx(i) c now cycle over all rows do j=id(i),id(i+1)-1 jtmp=jd(j) if (jtmp.ne.i) then lindx(k) = jtmp adjncy(k) = jtmp k=k+1 else if ( d(j) .le. 0) then ierr = 1 return endif iwsiz = iwsiz + 1 endif enddo enddo jtmp=m+1 xlindx(jtmp) = id(jtmp)-m adj(jtmp) = xlindx(jtmp) c check if we actually had m elements on the diagonal... if ( iwsiz .lt. m) then ierr = 1 return endif c initialize iwsiz to the later used value... iwsiz=7*m+3 c c c reorder the matrix using minimum degree ordering routine. c we call the genmmd function directly (do not pass via ordmmd). if (doperm.eq.1) then c delta - tolerance value for multiple elimination. c set to 0 below c maxint - maximum machine representable (short) integer c (any smaller estimate will do) for marking c nodes. 
c set to 32767 below call genmmd ( m, xlindx,lindx, invp,perm,0, 1 iwork(1), iwork(m+1), iwork(2*m+1), iwork(3*m+1) , 1 32767, nsub ) endif if (doperm.eq.2) then call genrcm ( m, nnzadj, xlindx,lindx, perm ) do i=1,m invp(perm(i))=i enddo endif if (doperm.eq.0) then do i=1,m invp(perm(i))=i enddo endif c c Call sfinit: Symbolic factorization initialization c to compute supernode partition and storage requirements c for symbolic factorization. New ordering is a postordering c of the nodal elimination tree c call sfinit(m,nnzadj,adj(1),adjncy(1),perm, & invp,colcnt,nnzl,nsub,nsuper,snode,xsuper,iwsiz, & iwork,ierr) c we do not have to test ierr, as we have hardwired iwsiz to 7*m+3 if (nnzl .gt. nnzlmax) then ierr = 4 go to 100 endif if (nsub .gt. nsubmax) then ierr = 5 go to 100 endif c c Call symfct: Perform supernodal symbolic factorization c iwsiz = nsuper + 2 * m + 1 call symfc2(m,nnzadj,adj(1),adjncy(1),perm,invp, & colcnt,nsuper,xsuper,snode,nsub,xlindx,lindx, & xlnz, & iwork(1), iwork(nsuper+1), iwork(nsuper+m+2) ,ierr) c ierr = -2 "inconsistency in the input" if (ierr .eq. -2) then ierr = 6 go to 100 endif c c Input numerical values into data structures of L call inpnv(id,jd,d,perm,invp,nsuper,xsuper,xlindx,lindx, & xlnz,lnz,iwork) c c Initialization for block factorization call bfinit(m,nsuper,xsuper,snode,xlindx,lindx,cachsz,tmpsiz, & split) c c Numerical factorization call blkfc2(nsuper,xsuper,snode,split,xlindx,lindx,xlnz, & lnz,iwork(1),iwork(nsuper+1),iwork(2*nsuper+1), & iwork(2*nsuper+m+1),tmpsiz,ierr) if (ierr .eq. -1) then ierr = 1 go to 100 elseif (ierr .eq. -2) then ierr = 3 go to 100 endif 100 continue c WRITE(6,699) nnzd c699 FORMAT(1X,' FOUND ',I6,' RETURNING!') return end C*********************************************************************** C*********************************************************************** C C Authors: Reinhard Furrer, based on inpnv C C C*********************************************************************** C*********************************************************************** C C ------------------------------------------------------ C Clean the array lnz C ------------------------------------------------------ C SUBROUTINE CLEANLNZ (NSUPER, XSUPER, XLNZ, LNZ) C IMPLICIT NONE INTEGER NSUPER INTEGER XSUPER(*), XLNZ(*) DOUBLE PRECISION LNZ(*) C INTEGER II, J, JSUPER C DO 500 JSUPER = 1, NSUPER DO 400 J = XSUPER(JSUPER), XSUPER(JSUPER+1)-1 DO 200 II = XLNZ(J), XLNZ(J+1)-1 LNZ(II) = 0.0 200 CONTINUE 400 CONTINUE C 500 CONTINUE RETURN END C*********************************************************************** C*********************************************************************** C C Version: 0.4 C Last modified: December 27, 1994 C Authors: Esmond G. Ng and Barry W. Peyton C C Mathematical Sciences Section, Oak Ridge National Laboratory C C*********************************************************************** C*********************************************************************** C************ ASSMB .... INDEXED ASSEMBLY OPERATION ************ C*********************************************************************** C*********************************************************************** C C PURPOSE: C THIS ROUTINE PERFORMS AN INDEXED ASSEMBLY (I.E., SCATTER-ADD) C OPERATION, ASSUMING DATA STRUCTURES USED IN SOME OF OUR SPARSE C CHOLESKY CODES. C C INPUT PARAMETERS: C M - NUMBER OF ROWS IN Y. C Q - NUMBER OF COLUMNS IN Y. C Y - BLOCK UPDATE TO BE INCORPORATED INTO FACTOR C STORAGE. 
C RELIND - RELATIVE INDICES FOR MAPPING THE UPDATES C ONTO THE TARGET COLUMNS. C XLNZ - POINTERS TO THE START OF EACH COLUMN IN THE C TARGET MATRIX. C C OUTPUT PARAMETERS: C LNZ - CONTAINS COLUMNS MODIFIED BY THE UPDATE C MATRIX. C C*********************************************************************** C SUBROUTINE ASSMB ( M , Q , Y , RELIND, XLNZ , & LNZ , LDA ) C C*********************************************************************** C C ----------- C PARAMETERS. C ----------- C INTEGER LDA , M , Q INTEGER XLNZ(*) INTEGER RELIND(*) DOUBLE PRECISION LNZ(*) , Y(*) C C ---------------- C LOCAL VARIABLES. C ---------------- C INTEGER ICOL , IL1 , IR , IY1 , LBOT1 , & YCOL , YOFF1 C C*********************************************************************** C C YOFF1 = 0 IY1 = 0 DO 200 ICOL = 1, Q YCOL = LDA - RELIND(ICOL) LBOT1 = XLNZ(YCOL+1) - 1 CDIR$ IVDEP DO 100 IR = ICOL, M IL1 = LBOT1 - RELIND(IR) IY1 = YOFF1 + IR LNZ(IL1) = LNZ(IL1) + Y(IY1) Y(IY1) = 0.0D0 100 CONTINUE YOFF1 = IY1 - ICOL 200 CONTINUE C RETURN END C*********************************************************************** C*********************************************************************** C C Version: 0.4 C Last modified: December 27, 1994 C Authors: Joseph W.H. Liu C C Mathematical Sciences Section, Oak Ridge National Laboratory C C*********************************************************************** C*********************************************************************** C****** BETREE ..... BINARY TREE REPRESENTATION OF ETREE ******* C*********************************************************************** C*********************************************************************** C C WRITTEN BY JOSEPH LIU (JUL 17, 1985) C C PURPOSE: C TO DETERMINE THE BINARY TREE REPRESENTATION OF THE ELIMINATION C TREE GIVEN BY THE PARENT VECTOR. THE RETURNED REPRESENTATION C WILL BE GIVEN BY THE FIRST-SON AND BROTHER VECTORS. THE ROOT C OF THE BINARY TREE IS ALWAYS NEQNS. C C INPUT PARAMETERS: C NEQNS - NUMBER OF EQUATIONS. C PARENT - THE PARENT VECTOR OF THE ELIMINATION TREE. C IT IS ASSUMED THAT PARENT(I) > I EXCEPT OF C THE ROOTS. C C OUTPUT PARAMETERS: C FSON - THE FIRST SON VECTOR. C BROTHR - THE BROTHER VECTOR. C C*********************************************************************** C SUBROUTINE BETREE ( NEQNS , PARENT, FSON , BROTHR ) C C*********************************************************************** C INTEGER(4) BROTHR(*) , FSON(*) , & PARENT(*) C INTEGER(4) NEQNS C C*********************************************************************** C INTEGER(4) LROOT , NODE , NDPAR C C*********************************************************************** C IF ( NEQNS .LE. 0 ) RETURN C DO 100 NODE = 1, NEQNS FSON(NODE) = 0 BROTHR(NODE) = 0 100 CONTINUE LROOT = NEQNS C ------------------------------------------------------------ C FOR EACH NODE := NEQNS-1 STEP -1 DOWNTO 1, DO THE FOLLOWING. C ------------------------------------------------------------ IF ( NEQNS .LE. 1 ) RETURN DO 300 NODE = NEQNS-1, 1, -1 NDPAR = PARENT(NODE) IF ( NDPAR .LE. 0 .OR. NDPAR .EQ. NODE ) THEN C ------------------------------------------------- C NODE HAS NO PARENT. GIVEN STRUCTURE IS A FOREST. C SET NODE TO BE ONE OF THE ROOTS OF THE TREES. C ------------------------------------------------- BROTHR(LROOT) = NODE LROOT = NODE ELSE C ------------------------------------------- C OTHERWISE, BECOMES FIRST SON OF ITS PARENT. 
C ------------------------------------------- BROTHR(NODE) = FSON(NDPAR) FSON(NDPAR) = NODE ENDIF 300 CONTINUE BROTHR(LROOT) = 0 C RETURN END C*********************************************************************** C*********************************************************************** C C Version: 0.4 C Last modified: December 27, 1994 C Authors: Esmond G. Ng and Barry W. Peyton C C Mathematical Sciences Section, Oak Ridge National Laboratory C C*********************************************************************** C*********************************************************************** C****** BFINIT ..... INITIALIZATION FOR BLOCK FACTORIZATION ****** C*********************************************************************** C*********************************************************************** C C PURPOSE: C THIS SUBROUTINE COMPUTES ITEMS NEEDED BY THE LEFT-LOOKING C BLOCK-TO-BLOCK CHOLESKY FACTORITZATION ROUTINE BLKFCT. C C INPUT PARAMETERS: C NEQNS - NUMBER OF EQUATIONS. C NSUPER - NUMBER OF SUPERNODES. C XSUPER - INTEGER ARRAY OF SIZE (NSUPER+1) CONTAINING C THE SUPERNODE PARTITIONING. C SNODE - SUPERNODE MEMBERSHIP. C (XLINDX,LINDX) - ARRAYS DESCRIBING THE SUPERNODAL STRUCTURE. C CACHSZ - CACHE SIZE (IN KBYTES). C C OUTPUT PARAMETERS: C TMPSIZ - SIZE OF WORKING STORAGE REQUIRED BY BLKFCT. C SPLIT - SPLITTING OF SUPERNODES SO THAT THEY FIT C INTO CACHE. C C*********************************************************************** C SUBROUTINE BFINIT ( NEQNS , NSUPER, XSUPER, SNODE , XLINDX, & LINDX , CACHSZ, TMPSIZ, SPLIT ) C C*********************************************************************** C INTEGER CACHSZ, NEQNS , NSUPER, TMPSIZ INTEGER XLINDX(*) , XSUPER(*) INTEGER LINDX (*) , SNODE (*) , & SPLIT(*) C C*********************************************************************** C C --------------------------------------------------- C DETERMINE FLOATING POINT WORKING SPACE REQUIREMENT. C --------------------------------------------------- CALL FNTSIZ ( NSUPER, XSUPER, SNODE , XLINDX, LINDX , & TMPSIZ ) C C ------------------------------- C PARTITION SUPERNODES FOR CACHE. C ------------------------------- CALL FNSPLT ( NEQNS , NSUPER, XSUPER, XLINDX, CACHSZ, & SPLIT ) C RETURN END C*********************************************************************** C*********************************************************************** C C Version: 0.3 C Last modified: March 6, 1995 C Authors: Esmond G. Ng and Barry W. Peyton C RF eliminated dependence on SMXPY and MMPY C C Mathematical Sciences Section, Oak Ridge National Laboratoy C C*********************************************************************** C*********************************************************************** C********* BLKFC2 ..... BLOCK GENERAL SPARSE CHOLESKY ********* C*********************************************************************** C*********************************************************************** C C PURPOSE: C THIS SUBROUTINE FACTORS A SPARSE POSITIVE DEFINITE MATRIX. C THE COMPUTATION IS ORGANIZED AROUND KERNELS THAT PERFORM C SUPERNODE-TO-SUPERNODE UPDATES, I.E., BLOCK-TO-BLOCK UPDATES. C C INPUT PARAMETERS: C NSUPER - NUMBER OF SUPERNODES. C XSUPER - SUPERNODE PARTITION. C SNODE - MAPS EACH COLUMN TO THE SUPERNODE CONTAINING C IT. C SPLIT - SPLITTING OF SUPERNODES SO THAT THEY FIT C INTO CACHE. C (XLINDX,LINDX) - ROW INDICES FOR EACH SUPERNODE (INCLUDING C THE DIAGONAL ELEMENTS). C (XLNZ,LNZ) - ON INPUT, CONTAINS MATRIX TO BE FACTORED. C TMPSIZ - SIZE OF TEMPORARY WORKING STORAGE. 
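C
C       (Editorial note, not part of the original documentation:
C        (XLNZ,LNZ) is a column-compressed storage of the lower
C        triangle; the nonzeros of column J are held in
C        LNZ(XLNZ(J)), ..., LNZ(XLNZ(J+1)-1), with the diagonal entry
C        first, while LINDX holds the corresponding row indices
C        supernode by supernode.  For instance, a 3-by-3 factor whose
C        first column has nonzeros in rows 1 and 3 and whose other
C        columns are diagonal only would use XLNZ = (1,3,4,5).)
C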
C C OUTPUT PARAMETERS: C LNZ - ON OUTPUT, CONTAINS CHOLESKY FACTOR. C IFLAG - ERROR FLAG. C 0: SUCCESSFUL FACTORIZATION. C -1: NONPOSITIVE DIAGONAL ENCOUNTERED, C MATRIX IS NOT POSITIVE DEFINITE. C -2: INSUFFICIENT WORKING STORAGE C [TEMP(*)]. C C WORKING PARAMETERS: C LINK - LINKS TOGETHER THE SUPERNODES IN A SUPERNODE C ROW. C LENGTH - LENGTH OF THE ACTIVE PORTION OF EACH C SUPERNODE. C INDMAP - VECTOR OF SIZE NEQNS INTO WHICH THE GLOBAL C INDICES ARE SCATTERED. C RELIND - MAPS LOCATIONS IN THE UPDATING COLUMNS TO C THE CORRESPONDING LOCATIONS IN THE UPDATED C COLUMNS. (RELIND IS GATHERED FROM INDMAP). C TEMP - REAL VECTOR FOR ACCUMULATING UPDATES. MUST C ACCOMODATE ALL COLUMNS OF A SUPERNODE. C C*********************************************************************** C SUBROUTINE BLKFC2 ( NSUPER, XSUPER, SNODE , SPLIT , XLINDX, & LINDX , XLNZ , LNZ , LINK , LENGTH, & INDMAP, RELIND, TMPSIZ, IFLAG ) C C********************************************************************* C C ----------- C PARAMETERS. C ----------- C INTEGER XLINDX(*) , XLNZ(*) INTEGER INDMAP(*) , LENGTH(*) , & LINDX(*) , LINK(*) , & RELIND(*) , SNODE(*) , & SPLIT(*) , XSUPER(*) INTEGER IFLAG , NSUPER, TMPSIZ DOUBLE PRECISION LNZ(*) C C ---------------- C LOCAL VARIABLES. C ---------------- C INTEGER FJCOL , FKCOL , I , ILEN , ILPNT , & INDDIF, JLEN , JLPNT , JSUP , JXPNT , & KFIRST, KLAST , KLEN , KLPNT , KSUP , & KXPNT , LJCOL , NCOLUP, NJCOLS, NKCOLS, & NXKSUP, NXTCOL, NXTSUP, STORE DOUBLE PRECISION TEMP(TMPSIZ) C RF: put TEMP(*) into a local variable DOUBLE PRECISION MXDIAG INTEGER NTINY C********************************************************************* C IFLAG = 0 NTINY = 0 NXTCOL = 0 C C ----------------------------------------------------------- C INITIALIZE EMPTY ROW LISTS IN LINK(*) AND ZERO OUT TEMP(*). C ----------------------------------------------------------- DO 100 JSUP = 1, NSUPER LINK(JSUP) = 0 100 CONTINUE DO 200 I = 1, TMPSIZ TEMP(I) = 0.0D+00 200 CONTINUE C COMPUTE MAXIMUM DIAGONAL ELEMENT IN INPUT MATRIX MXDIAG = 0.D0 DO 201 I = 1, XSUPER(NSUPER+1)-1 FJCOL = XLNZ(I) MXDIAG = MAX(MXDIAG, LNZ(FJCOL)) 201 CONTINUE C C --------------------------- C FOR EACH SUPERNODE JSUP ... C --------------------------- DO 600 JSUP = 1, NSUPER C C ------------------------------------------------ C FJCOL ... FIRST COLUMN OF SUPERNODE JSUP. C LJCOL ... LAST COLUMN OF SUPERNODE JSUP. C NJCOLS ... NUMBER OF COLUMNS IN SUPERNODE JSUP. C JLEN ... LENGTH OF COLUMN FJCOL. C JXPNT ... POINTER TO INDEX OF FIRST C NONZERO IN COLUMN FJCOL. C ------------------------------------------------ FJCOL = XSUPER(JSUP) NJCOLS = XSUPER(JSUP+1) - FJCOL LJCOL = FJCOL + NJCOLS - 1 JLEN = XLNZ(FJCOL+1) - XLNZ(FJCOL) JXPNT = XLINDX(JSUP) C print *, 'Super Node: ', JSUP, ' first: ', FJCOL, C . ' last: ', LJCOL C C C ----------------------------------------------------- C SET UP INDMAP(*) TO MAP THE ENTRIES IN UPDATE COLUMNS C TO THEIR CORRESPONDING POSITIONS IN UPDATED COLUMNS, C RELATIVE THE THE BOTTOM OF EACH UPDATED COLUMN. C ----------------------------------------------------- CALL LDINDX ( JLEN, LINDX(JXPNT), INDMAP ) C C ----------------------------------------- C FOR EVERY SUPERNODE KSUP IN ROW(JSUP) ... C ----------------------------------------- KSUP = LINK(JSUP) 300 IF ( KSUP .GT. 0 ) THEN NXKSUP = LINK(KSUP) C C ------------------------------------------------------- C GET INFO ABOUT THE CMOD(JSUP,KSUP) UPDATE. C C FKCOL ... FIRST COLUMN OF SUPERNODE KSUP. C NKCOLS ... NUMBER OF COLUMNS IN SUPERNODE KSUP. C KLEN ... 
LENGTH OF ACTIVE PORTION OF COLUMN FKCOL. C KXPNT ... POINTER TO INDEX OF FIRST NONZERO IN ACTIVE C PORTION OF COLUMN FJCOL. C ------------------------------------------------------- FKCOL = XSUPER(KSUP) NKCOLS = XSUPER(KSUP+1) - FKCOL KLEN = LENGTH(KSUP) KXPNT = XLINDX(KSUP+1) - KLEN C C ------------------------------------------- C PERFORM CMOD(JSUP,KSUP), WITH SPECIAL CASES C HANDLED DIFFERENTLY. C ------------------------------------------- C IF ( KLEN .NE. JLEN ) THEN C C ------------------------------------------- C SPARSE CMOD(JSUP,KSUP). C C NCOLUP ... NUMBER OF COLUMNS TO BE UPDATED. C ------------------------------------------- C DO 400 I = 0, KLEN-1 NXTCOL = LINDX(KXPNT+I) IF ( NXTCOL .GT. LJCOL ) GO TO 500 400 CONTINUE I = KLEN 500 CONTINUE NCOLUP = I C IF ( NKCOLS .EQ. 1 ) THEN C C ---------------------------------------------- C UPDATING TARGET SUPERNODE BY TRIVIAL C SUPERNODE (WITH ONE COLUMN). C C KLPNT ... POINTER TO FIRST NONZERO IN ACTIVE C PORTION OF COLUMN FKCOL. C ---------------------------------------------- KLPNT = XLNZ(FKCOL+1) - KLEN CALL MMPYI ( KLEN, NCOLUP, LINDX(KXPNT), & LNZ(KLPNT), XLNZ, LNZ, INDMAP ) C ELSE C C -------------------------------------------- C KFIRST ... FIRST INDEX OF ACTIVE PORTION OF C SUPERNODE KSUP (FIRST COLUMN TO C BE UPDATED). C KLAST ... LAST INDEX OF ACTIVE PORTION OF C SUPERNODE KSUP. C -------------------------------------------- C KFIRST = LINDX(KXPNT) KLAST = LINDX(KXPNT+KLEN-1) INDDIF = INDMAP(KFIRST) - INDMAP(KLAST) C IF ( INDDIF .LT. KLEN ) THEN C C --------------------------------------- C DENSE CMOD(JSUP,KSUP). C C ILPNT ... POINTER TO FIRST NONZERO IN C COLUMN KFIRST. C ILEN ... LENGTH OF COLUMN KFIRST. C --------------------------------------- ILPNT = XLNZ(KFIRST) ILEN = XLNZ(KFIRST+1) - ILPNT CALL MMPY ( KLEN, NKCOLS, NCOLUP, & SPLIT(FKCOL), XLNZ(FKCOL), & LNZ, LNZ(ILPNT), ILEN ) C ELSE C C ------------------------------- C GENERAL SPARSE CMOD(JSUP,KSUP). C COMPUTE CMOD(JSUP,KSUP) UPDATE C IN WORK STORAGE. C ------------------------------- STORE = KLEN * NCOLUP - NCOLUP * & (NCOLUP-1) / 2 IF ( STORE .GT. TMPSIZ ) THEN IFLAG = -2 RETURN ENDIF CALL MMPY ( KLEN, NKCOLS, NCOLUP, & SPLIT(FKCOL), XLNZ(FKCOL), & LNZ, TEMP, KLEN ) C ---------------------------------------- C GATHER INDICES OF KSUP RELATIVE TO JSUP. C ---------------------------------------- CALL IGATHR ( KLEN, LINDX(KXPNT), & INDMAP, RELIND ) C -------------------------------------- C INCORPORATE THE CMOD(JSUP,KSUP) BLOCK C UPDATE INTO THE TO APPROPRIATE COLUMNS C OF L. C -------------------------------------- CALL ASSMB ( KLEN, NCOLUP, TEMP, RELIND, & XLNZ(FJCOL), LNZ, JLEN ) C ENDIF C ENDIF C ELSE C C ---------------------------------------------- C DENSE CMOD(JSUP,KSUP). C JSUP AND KSUP HAVE IDENTICAL STRUCTURE. C C JLPNT ... POINTER TO FIRST NONZERO IN COLUMN C FJCOL. C ---------------------------------------------- JLPNT = XLNZ(FJCOL) CALL MMPY ( KLEN, NKCOLS, NJCOLS, SPLIT(FKCOL), & XLNZ(FKCOL), LNZ, LNZ(JLPNT), JLEN) NCOLUP = NJCOLS IF ( KLEN .GT. NJCOLS ) THEN NXTCOL = LINDX(JXPNT+NJCOLS) ENDIF C ENDIF C C ------------------------------------------------ C LINK KSUP INTO LINKED LIST OF THE NEXT SUPERNODE C IT WILL UPDATE AND DECREMENT KSUP'S ACTIVE C LENGTH. C ------------------------------------------------ IF ( KLEN .GT. NCOLUP ) THEN NXTSUP = SNODE(NXTCOL) LINK(KSUP) = LINK(NXTSUP) LINK(NXTSUP) = KSUP LENGTH(KSUP) = KLEN - NCOLUP ELSE LENGTH(KSUP) = 0 ENDIF C C ------------------------------- C NEXT UPDATING SUPERNODE (KSUP). 
C ------------------------------- KSUP = NXKSUP GO TO 300 C ENDIF C C ---------------------------------------------- C APPLY PARTIAL CHOLESKY TO THE COLUMNS OF JSUP. C ---------------------------------------------- CxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPC CALL CHLSUP ( JLEN, NJCOLS, SPLIT(FJCOL), XLNZ(FJCOL), LNZ, & MXDIAG, NTINY ) IF ( IFLAG .NE. 0 ) THEN IFLAG = -1 RETURN ENDIF C C ----------------------------------------------- C INSERT JSUP INTO LINKED LIST OF FIRST SUPERNODE C IT WILL UPDATE. C ----------------------------------------------- IF ( JLEN .GT. NJCOLS ) THEN NXTCOL = LINDX(JXPNT+NJCOLS) NXTSUP = SNODE(NXTCOL) LINK(JSUP) = LINK(NXTSUP) LINK(NXTSUP) = JSUP LENGTH(JSUP) = JLEN - NJCOLS ELSE LENGTH(JSUP) = 0 ENDIF C 600 CONTINUE C CxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPC C IF(NTINY .NE. 0) WRITE(6,699) NTINY C 699 FORMAT(1X,' FOUND ',I6,' TINY DIAGONALS; REPLACED WITH INF') C C SET IFLAG TO -1 TO INDICATE PRESENCE OF TINY DIAGONALS C IF(NTINY .NE. 0) IFLAG = -1 CxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPC RETURN END C*********************************************************************** C*********************************************************************** C C Written: October 6, 1996 by SJW. Based on routine BLKSLV of C Esmond G. Ng and Barry W. Peyton. C C Modified: Sept 30, 1999 to improve efficiency in the case C in which the right-hand side and solution are both C expected to be sparse. Happens a lot in "dense" C column handling. C C*********************************************************************** C*********************************************************************** C********* BLKSLB ... BACK TRIANGULAR SUBSTITUTION ********** C*********************************************************************** C*********************************************************************** C C PURPOSE: C GIVEN THE CHOLESKY FACTORIZATION OF A SPARSE SYMMETRIC C POSITIVE DEFINITE MATRIX, THIS SUBROUTINE PERFORMS THE C BACKWARD TRIANGULAR SUBSTITUTION. IT USES OUTPUT FROM BLKFCT. C C INPUT PARAMETERS: C NSUPER - NUMBER OF SUPERNODES. C XSUPER - SUPERNODE PARTITION. C (XLINDX,LINDX) - ROW INDICES FOR EACH SUPERNODE. C (XLNZ,LNZ) - CHOLESKY FACTOR. C C UPDATED PARAMETERS: C RHS - ON INPUT, CONTAINS THE RIGHT HAND SIDE. ON C OUTPUT, CONTAINS THE SOLUTION. C C*********************************************************************** C SUBROUTINE BLKSLB ( NSUPER, XSUPER, XLINDX, LINDX , XLNZ , & LNZ , RHS ) C C*********************************************************************** C INTEGER NSUPER INTEGER LINDX(*) , XSUPER(*) INTEGER XLINDX(*) , XLNZ(*) DOUBLE PRECISION LNZ(*) , RHS(*) C C*********************************************************************** C INTEGER FJCOL , I , IPNT , IX , IXSTOP, & IXSTRT, JCOL , JPNT , JSUP , LJCOL DOUBLE PRECISION T C C*********************************************************************** C IF ( NSUPER .LE. 0 ) RETURN C ------------------------- C BACKWARD SUBSTITUTION ... C ------------------------- LJCOL = XSUPER(NSUPER+1) - 1 DO 600 JSUP = NSUPER, 1, -1 FJCOL = XSUPER(JSUP) IXSTOP = XLNZ(LJCOL+1) - 1 JPNT = XLINDX(JSUP) + (LJCOL - FJCOL) DO 500 JCOL = LJCOL, FJCOL, -1 IXSTRT = XLNZ(JCOL) IPNT = JPNT + 1 T = RHS(JCOL) CDIR$ IVDEP DO 400 IX = IXSTRT+1, IXSTOP I = LINDX(IPNT) IF(RHS(I) .NE. 0.D0) T = T - LNZ(IX)*RHS(I) IPNT = IPNT + 1 400 CONTINUE IF(T .NE. 
0.D0) THEN RHS(JCOL) = T/LNZ(IXSTRT) ELSE RHS(JCOL) = 0.D0 ENDIF IXSTOP = IXSTRT - 1 JPNT = JPNT - 1 500 CONTINUE LJCOL = FJCOL - 1 600 CONTINUE C RETURN END C*********************************************************************** C*********************************************************************** C C Written: October 6, 1996 by SJW. Based on routine BLKSLV of C Esmond G. Ng and Barry W. Peyton. C C Modified: Sept 30, 1999 to improve efficiency in the case C in which the right-hand side and solution are both C expected to be sparse. Happens a lot in "dense" C column handling. C C*********************************************************************** C*********************************************************************** C********* BLKSLF ... FORWARD TRIANGULAR SUBSTITUTION ********** C*********************************************************************** C*********************************************************************** C C PURPOSE: C GIVEN THE CHOLESKY FACTORIZATION OF A SPARSE SYMMETRIC C POSITIVE DEFINITE MATRIX, THIS SUBROUTINE PERFORMS THE C FORWARD TRIANGULAR SUBSTITUTIOn. IT USES OUTPUT FROM BLKFCT. C C INPUT PARAMETERS: C NSUPER - NUMBER OF SUPERNODES. C XSUPER - SUPERNODE PARTITION. C (XLINDX,LINDX) - ROW INDICES FOR EACH SUPERNODE. C (XLNZ,LNZ) - CHOLESKY FACTOR. C C UPDATED PARAMETERS: C RHS - ON INPUT, CONTAINS THE RIGHT HAND SIDE. ON C OUTPUT, CONTAINS THE SOLUTION. C C*********************************************************************** C SUBROUTINE BLKSLF ( NSUPER, XSUPER, XLINDX, LINDX , XLNZ , & LNZ , RHS ) C C*********************************************************************** C INTEGER NSUPER INTEGER LINDX(*) , XSUPER(*) INTEGER XLINDX(*) , XLNZ(*) DOUBLE PRECISION LNZ(*) , RHS(*) C C*********************************************************************** C INTEGER FJCOL , I , IPNT , IX , IXSTOP, & IXSTRT, JCOL , JPNT , JSUP , LJCOL DOUBLE PRECISION T C C*********************************************************************** C IF ( NSUPER .LE. 0 ) RETURN C C ------------------------ C FORWARD SUBSTITUTION ... C ------------------------ FJCOL = XSUPER(1) DO 300 JSUP = 1, NSUPER LJCOL = XSUPER(JSUP+1) - 1 IXSTRT = XLNZ(FJCOL) JPNT = XLINDX(JSUP) DO 200 JCOL = FJCOL, LJCOL IXSTOP = XLNZ(JCOL+1) - 1 IF(RHS(JCOL) .NE. 0.D0) THEN T = RHS(JCOL)/LNZ(IXSTRT) RHS(JCOL) = T IPNT = JPNT + 1 CDIR$ IVDEP DO 100 IX = IXSTRT+1, IXSTOP I = LINDX(IPNT) RHS(I) = RHS(I) - T*LNZ(IX) IPNT = IPNT + 1 100 CONTINUE ENDIF IXSTRT = IXSTOP + 1 JPNT = JPNT + 1 200 CONTINUE FJCOL = LJCOL + 1 300 CONTINUE C RETURN END C*********************************************************************** C*********************************************************************** C C Version: 0.4 C Last modified: December 27, 1994 C Authors: Esmond G. Ng and Barry W. Peyton C C Mathematical Sciences Section, Oak Ridge National Laboratory C C Modified: Sept 30, 1999 to improve efficiency in the case C in which the right-hand side and solution are both C expected to be sparse. Happens a lot in "dense" C column handling. C C*********************************************************************** C*********************************************************************** C********* BLKSLV ... 
BLOCK TRIANGULAR SOLUTIONS ********** C*********************************************************************** C*********************************************************************** C C PURPOSE: C GIVEN THE CHOLESKY FACTORIZATION OF A SPARSE SYMMETRIC C POSITIVE DEFINITE MATRIX, THIS SUBROUTINE PERFORMS THE C TRIANGULAR SOLUTION. IT USES OUTPUT FROM BLKFCT. C C INPUT PARAMETERS: C NSUPER - NUMBER OF SUPERNODES. C XSUPER - SUPERNODE PARTITION. C (XLINDX,LINDX) - ROW INDICES FOR EACH SUPERNODE. C (XLNZ,LNZ) - CHOLESKY FACTOR. C C UPDATED PARAMETERS: C RHS - ON INPUT, CONTAINS THE RIGHT HAND SIDE. ON C OUTPUT, CONTAINS THE SOLUTION. C C*********************************************************************** C SUBROUTINE BLKSLV ( NSUPER, XSUPER, XLINDX, LINDX , XLNZ , & LNZ , RHS ) C C*********************************************************************** C INTEGER NSUPER INTEGER LINDX(*) , XSUPER(*) INTEGER XLINDX(*) , XLNZ(*) DOUBLE PRECISION LNZ(*) , RHS(*) C C*********************************************************************** C INTEGER FJCOL , I , IPNT , IX , IXSTOP, & IXSTRT, JCOL , JPNT , JSUP , LJCOL DOUBLE PRECISION T C C*********************************************************************** C IF ( NSUPER .LE. 0 ) RETURN C C ------------------------ C FORWARD SUBSTITUTION ... C ------------------------ FJCOL = XSUPER(1) DO 300 JSUP = 1, NSUPER LJCOL = XSUPER(JSUP+1) - 1 IXSTRT = XLNZ(FJCOL) JPNT = XLINDX(JSUP) C print *, "JSUP", JSUP, FJCOL, LJCOL DO 200 JCOL = FJCOL, LJCOL IXSTOP = XLNZ(JCOL+1) - 1 C print *, JSUP, JCOL C print *, RHS(JCOL) IF(RHS(JCOL) .NE. 0.D0) THEN T = RHS(JCOL)/LNZ(IXSTRT) RHS(JCOL) = T IPNT = JPNT + 1 CDIR$ IVDEP DO 100 IX = IXSTRT+1, IXSTOP I = LINDX(IPNT) RHS(I) = RHS(I) - T*LNZ(IX) IPNT = IPNT + 1 100 CONTINUE ENDIF IXSTRT = IXSTOP + 1 JPNT = JPNT + 1 200 CONTINUE FJCOL = LJCOL + 1 300 CONTINUE C C ------------------------- C BACKWARD SUBSTITUTION ... C ------------------------- LJCOL = XSUPER(NSUPER+1) - 1 DO 600 JSUP = NSUPER, 1, -1 FJCOL = XSUPER(JSUP) IXSTOP = XLNZ(LJCOL+1) - 1 JPNT = XLINDX(JSUP) + (LJCOL - FJCOL) DO 500 JCOL = LJCOL, FJCOL, -1 IXSTRT = XLNZ(JCOL) IPNT = JPNT + 1 T = RHS(JCOL) CDIR$ IVDEP DO 400 IX = IXSTRT+1, IXSTOP I = LINDX(IPNT) IF(RHS(I) .NE. 0.D0) T = T - LNZ(IX)*RHS(I) IPNT = IPNT + 1 400 CONTINUE IF(T .NE. 0.D0) THEN RHS(JCOL) = T/LNZ(IXSTRT) ELSE RHS(JCOL) = 0.D0 ENDIF IXSTOP = IXSTRT - 1 JPNT = JPNT - 1 500 CONTINUE LJCOL = FJCOL - 1 600 CONTINUE C RETURN END C*********************************************************************** C*********************************************************************** C C Version: 0.4 C Last modified: January 12, 1995 C Authors: Esmond G. Ng and Barry W. Peyton C C Mathematical Sciences Section, Oak Ridge National Laboratory C C*********************************************************************** C*********************************************************************** C****** BTREE2 ..... BINARY TREE REPRESENTATION OF ETREE ******* C*********************************************************************** C*********************************************************************** C C PURPOSE: C TO DETERMINE A BINARY TREE REPRESENTATION OF THE ELIMINATION C TREE, FOR WHICH EVERY "LAST CHILD" HAS THE MAXIMUM POSSIBLE C COLUMN NONZERO COUNT IN THE FACTOR. THE RETURNED REPRESENTATION C WILL BE GIVEN BY THE FIRST-SON AND BROTHER VECTORS. THE ROOT OF C THE BINARY TREE IS ALWAYS NEQNS. C C INPUT PARAMETERS: C NEQNS - NUMBER OF EQUATIONS. C PARENT - THE PARENT VECTOR OF THE ELIMINATION TREE. 
C IT IS ASSUMED THAT PARENT(I) > I EXCEPT OF C THE ROOTS. C COLCNT - COLUMN NONZERO COUNTS OF THE FACTOR. C C OUTPUT PARAMETERS: C FSON - THE FIRST SON VECTOR. C BROTHR - THE BROTHER VECTOR. C C WORKING PARAMETERS: C LSON - LAST SON VECTOR. C C*********************************************************************** C SUBROUTINE BTREE2 ( NEQNS , PARENT, COLCNT, FSON , BROTHR, & LSON ) C C*********************************************************************** C INTEGER BROTHR(*) , COLCNT(*) , & FSON(*) , LSON(*) , & PARENT(*) C INTEGER NEQNS C C*********************************************************************** C INTEGER(4) LROOT , NODE , NDLSON, NDPAR C C*********************************************************************** C IF ( NEQNS .LE. 0 ) RETURN C DO 100 NODE = 1, NEQNS FSON(NODE) = 0 BROTHR(NODE) = 0 LSON(NODE) = 0 100 CONTINUE LROOT = NEQNS C ------------------------------------------------------------ C FOR EACH NODE := NEQNS-1 STEP -1 DOWNTO 1, DO THE FOLLOWING. C ------------------------------------------------------------ IF ( NEQNS .LE. 1 ) RETURN DO 300 NODE = NEQNS-1, 1, -1 NDPAR = PARENT(NODE) IF ( NDPAR .LE. 0 .OR. NDPAR .EQ. NODE ) THEN C ------------------------------------------------- C NODE HAS NO PARENT. GIVEN STRUCTURE IS A FOREST. C SET NODE TO BE ONE OF THE ROOTS OF THE TREES. C ------------------------------------------------- BROTHR(LROOT) = NODE LROOT = NODE ELSE C ------------------------------------------- C OTHERWISE, BECOMES FIRST SON OF ITS PARENT. C ------------------------------------------- NDLSON = LSON(NDPAR) IF ( NDLSON .NE. 0 ) THEN IF ( COLCNT(NODE) .GE. COLCNT(NDLSON) ) THEN BROTHR(NODE) = FSON(NDPAR) FSON(NDPAR) = NODE ELSE BROTHR(NDLSON) = NODE LSON(NDPAR) = NODE ENDIF ELSE FSON(NDPAR) = NODE LSON(NDPAR) = NODE ENDIF ENDIF 300 CONTINUE BROTHR(LROOT) = 0 C RETURN END C*********************************************************************** C*********************************************************************** C C Version: 0.3 C Last modified: December 27, 1994 C Authors: Esmond G. Ng and Barry W. Peyton C C Modified by RF: Eliminated the MMPYN, SMXPY as arguments C C Mathematical Sciences Section, Oak Ridge National Laboratoy C C*********************************************************************** C*********************************************************************** C****** CHLSUP .... DENSE CHOLESKY WITHIN SUPERNODE ************** C*********************************************************************** C*********************************************************************** C C PURPOSE - THIS ROUTINE PERFORMS CHOLESKY C FACTORIZATION ON THE COLUMNS OF A SUPERNODE C THAT HAVE RECEIVED ALL UPDATES FROM COLUMNS C EXTERNAL TO THE SUPERNODE. C C INPUT PARAMETERS - C M - NUMBER OF ROWS (LENGTH OF THE FIRST COLUMN). C N - NUMBER OF COLUMNS IN THE SUPERNODE. C XPNT - XPNT(J+1) POINTS ONE LOCATION BEYOND THE END C OF THE J-TH COLUMN OF THE SUPERNODE. C X(*) - CONTAINS THE COLUMNS OF OF THE SUPERNODE TO C BE FACTORED. C C EXTERNAL ROUTINES - C MMPY8 - MATRIX-MATRIX MULTIPLY WITH 8 LOOP UNROLLING. C C OUTPUT PARAMETERS - C X(*) - ON OUTPUT, CONTAINS THE FACTORED COLUMNS OF C THE SUPERNODE. C IFLAG - UNCHANGED IF THERE IS NO ERROR. C =1 IF NONPOSITIVE DIAGONAL ENTRY IS ENCOUNTERED. C C*********************************************************************** C SUBROUTINE CHLSUP ( M, N, SPLIT, XPNT, X, MXDIAG, NTINY & ) C C*********************************************************************** C C ----------- C PARAMETERS. 
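C
C       (Editorial note, not part of the original code: CHLSUP factors
C        the supernode block by block.  SPLIT(*) gives the number of
C        columns in each cache-sized block; PCHOL factors the current
C        block and MMPY8 applies it to the columns of the supernode
C        that remain to be factored.)
C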
C ----------- C EXTERNAL MMPY8 C INTEGER M, N C INTEGER XPNT(*), SPLIT(*) C DOUBLE PRECISION X(*), MXDIAG INTEGER NTINY C C ---------------- C LOCAL VARIABLES. C ---------------- C INTEGER FSTCOL, JBLK , JPNT , MM , NN , & NXTCOL, Q C C*********************************************************************** C JBLK = 0 FSTCOL = 1 MM = M JPNT = XPNT(FSTCOL) C C ---------------------------------------- C FOR EACH BLOCK JBLK IN THE SUPERNODE ... C ---------------------------------------- 100 CONTINUE IF ( FSTCOL .LE. N ) THEN JBLK = JBLK + 1 NN = SPLIT(JBLK) C ------------------------------------------ C ... PERFORM PARTIAL CHOLESKY FACTORIZATION C ON THE BLOCK. C ------------------------------------------ CALL PCHOL ( MM, NN, XPNT(FSTCOL), X, MXDIAG, NTINY) C ---------------------------------------------- C ... APPLY THE COLUMNS IN JBLK TO ANY COLUMNS C OF THE SUPERNODE REMAINING TO BE COMPUTED. C ---------------------------------------------- NXTCOL = FSTCOL + NN Q = N - NXTCOL + 1 MM = MM - NN JPNT = XPNT(NXTCOL) IF ( Q .GT. 0 ) THEN CALL MMPY8( MM, NN, Q, XPNT(FSTCOL), X, X(JPNT), MM ) ENDIF FSTCOL = NXTCOL GO TO 100 ENDIF C RETURN END C*********************************************************************** C*********************************************************************** C C Version: 0.4 C Last modified: December 27, 1994 C Authors: Esmond G. Ng and Barry W. Peyton C C Mathematical Sciences Section, Oak Ridge National Laboratory C C*********************************************************************** C*********************************************************************** C********** CHORDR ..... CHILD REORDERING *********** C*********************************************************************** C*********************************************************************** C C PURPOSE: C REARRANGE THE CHILDREN OF EACH VERTEX SO THAT THE LAST ONE C MAXIMIZES (AMONG THE CHILDREN) THE NUMBER OF NONZEROS IN THE C CORRESPONDING COLUMN OF L. ALSO DETERMINE AN NEW POSTORDERING C BASED ON THE STRUCTURE OF THE MODIFIED ELIMINATION TREE. C C INPUT PARAMETERS: C NEQNS - NUMBER OF EQUATIONS. C C UPDATED PARAMETERS: C (PERM,INVP) - ON INPUT, THE GIVEN PERM AND INVERSE PERM C VECTORS. ON OUTPUT, THE NEW PERM AND C INVERSE PERM VECTORS OF THE NEW C POSTORDERING. C COLCNT - COLUMN COUNTS IN L UNDER INITIAL ORDERING; C MODIFIED TO REFLECT THE NEW ORDERING. C C OUTPUT PARAMETERS: C PARENT - THE PARENT VECTOR OF THE ELIMINATION TREE C ASSOCIATED WITH THE NEW ORDERING. C C WORKING PARAMETERS: C FSON - THE FIRST SON VECTOR. C BROTHR - THE BROTHER VECTOR. C INVPOS - THE INVERSE PERM VECTOR FOR THE C POSTORDERING. C C PROGRAM SUBROUTINES: C BTREE2, EPOST2, INVINV. C C*********************************************************************** C SUBROUTINE CHORDR ( NEQNS , PERM , INVP , & COLCNT, PARENT, FSON , BROTHR, INVPOS ) C C*********************************************************************** C INTEGER BROTHR(*) , & COLCNT(*) , FSON(*) , & INVP(*) , INVPOS(*) , & PARENT(*) , PERM(*) C INTEGER NEQNS C C*********************************************************************** C C ---------------------------------------------------------- C COMPUTE A BINARY REPRESENTATION OF THE ELIMINATION TREE, C SO THAT EACH "LAST CHILD" MAXIMIZES AMONG ITS SIBLINGS THE C NUMBER OF NONZEROS IN THE CORRESPONDING COLUMNS OF L. 
C ---------------------------------------------------------- CALL BTREE2 ( NEQNS , PARENT, COLCNT, FSON , BROTHR, & INVPOS ) C C ---------------------------------------------------- C POSTORDER THE ELIMINATION TREE (USING THE NEW BINARY C REPRESENTATION. C ---------------------------------------------------- CALL EPOST2 ( NEQNS , FSON , BROTHR, INVPOS, PARENT, & COLCNT, PERM ) C C -------------------------------------------------------- C COMPOSE THE ORIGINAL ORDERING WITH THE NEW POSTORDERING. C -------------------------------------------------------- CALL INVINV ( NEQNS , INVP , INVPOS, PERM ) C RETURN END C*********************************************************************** C*********************************************************************** C C Version: 0.4 C Last modified: December 27, 1994 C Authors: Esmond G. Ng and Barry W. Peyton C C Mathematical Sciences Section, Oak Ridge National Laboratory C C*********************************************************************** C*********************************************************************** C****** DSCAL1 .... SCALE A VECTOR ************** C*********************************************************************** C*********************************************************************** C C PURPOSE - THIS ROUTINE COMPUTES A <-- AX, WHERE A IS A C SCALAR AND X IS A VECTOR. C C INPUT PARAMETERS - C N - LENGTH OF THE VECTOR X. C A - SCALAR MULIPLIER. C X - VECTOR TO BE SCALED. C C OUTPUT PARAMETERS - C X - REPLACED BY THE SCALED VECTOR, AX. C C*********************************************************************** C SUBROUTINE DSCAL1 ( N, A, X ) C C*********************************************************************** C C ----------- C PARAMETERS. C ----------- INTEGER N DOUBLE PRECISION A, X(N) C C ---------------- C LOCAL VARIABLES. C ---------------- INTEGER I C C*********************************************************************** C DO 100 I = 1, N X(I) = A * X(I) 100 CONTINUE RETURN END C*********************************************************************** C*********************************************************************** C C Version: 0.4 C Last modified: December 27, 1994 C Authors: Esmond G. Ng and Barry W. Peyton C C Mathematical Sciences Section, Oak Ridge National Laboratory C C*********************************************************************** C*********************************************************************** C*************** EPOST2 ..... ETREE POSTORDERING #2 *************** C*********************************************************************** C*********************************************************************** C C PURPOSE: C BASED ON THE BINARY REPRESENTATION (FIRST-SON,BROTHER) OF THE C ELIMINATION TREE, A POSTORDERING IS DETERMINED. THE C CORRESPONDING PARENT AND COLCNT VECTORS ARE ALSO MODIFIED TO C REFLECT THE REORDERING. C C INPUT PARAMETERS: C ROOT - ROOT OF THE ELIMINATION TREE (USUALLY IT C IS NEQNS). C FSON - THE FIRST SON VECTOR. C BROTHR - THE BROTHR VECTOR. C C UPDATED PARAMETERS: C PARENT - THE PARENT VECTOR. C COLCNT - COLUMN NONZERO COUNTS OF THE FACTOR. C C OUTPUT PARAMETERS: C INVPOS - INVERSE PERMUTATION FOR THE POSTORDERING. C C WORKING PARAMETERS: C STACK - THE STACK FOR POSTORDER TRAVERSAL OF THE C TREE. 
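C
C       (Editorial note: the small subroutine below is an illustrative
C        sketch only and is not part of the original SPARSPAK/spam
C        sources; the name EXPOST is hypothetical and is referenced
C        nowhere else.  It shows, for a 3-node elimination tree with
C        parent vector PARENT = (3,3,0), how the first-son/brother
C        representation is obtained from BETREE and then postordered
C        by EPOST2, which is documented above and defined below.)
C
      SUBROUTINE EXPOST ( INVPOS )
      IMPLICIT NONE
      INTEGER INVPOS(3)
      INTEGER PARENT(3), FSON(3), BROTHR(3), COLCNT(3), STACK(3)
C     nodes 1 and 2 are children of the root 3
      PARENT(1) = 3
      PARENT(2) = 3
      PARENT(3) = 0
C     factor column counts (arbitrary illustrative values)
      COLCNT(1) = 1
      COLCNT(2) = 1
      COLCNT(3) = 2
C     binary (first-son/brother) representation of the tree
      CALL BETREE ( 3, PARENT, FSON, BROTHR )
C     postorder the tree; INVPOS returns the inverse permutation,
C     PARENT and COLCNT are relabelled accordingly
      CALL EPOST2 ( 3, FSON, BROTHR, INVPOS, PARENT, COLCNT, STACK )
      RETURN
      END
C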
C C*********************************************************************** C SUBROUTINE EPOST2 ( ROOT , FSON , BROTHR, INVPOS, PARENT, & COLCNT, STACK ) C C*********************************************************************** C INTEGER(4) BROTHR(*) , COLCNT(*) , & FSON(*) , INVPOS(*) , & PARENT(*) , STACK(*) C INTEGER(4) ROOT C C*********************************************************************** C INTEGER(4) ITOP , NDPAR , NODE , NUM , NUNODE C C*********************************************************************** C NUM = 0 ITOP = 0 NODE = ROOT C ------------------------------------------------------------- C TRAVERSE ALONG THE FIRST SONS POINTER AND PUSH THE TREE NODES C ALONG THE TRAVERSAL INTO THE STACK. C ------------------------------------------------------------- 100 CONTINUE ITOP = ITOP + 1 STACK(ITOP) = NODE NODE = FSON(NODE) IF ( NODE .GT. 0 ) GO TO 100 C ---------------------------------------------------------- C IF POSSIBLE, POP A TREE NODE FROM THE STACK AND NUMBER IT. C ---------------------------------------------------------- 200 CONTINUE IF ( ITOP .LE. 0 ) GO TO 300 NODE = STACK(ITOP) ITOP = ITOP - 1 NUM = NUM + 1 INVPOS(NODE) = NUM C ---------------------------------------------------- C THEN, TRAVERSE TO ITS YOUNGER BROTHER IF IT HAS ONE. C ---------------------------------------------------- NODE = BROTHR(NODE) IF ( NODE .LE. 0 ) GO TO 200 GO TO 100 C 300 CONTINUE C ------------------------------------------------------------ C DETERMINE THE NEW PARENT VECTOR OF THE POSTORDERING. BROTHR C IS USED TEMPORARILY FOR THE NEW PARENT VECTOR. C ------------------------------------------------------------ DO 400 NODE = 1, NUM NUNODE = INVPOS(NODE) NDPAR = PARENT(NODE) IF ( NDPAR .GT. 0 ) NDPAR = INVPOS(NDPAR) BROTHR(NUNODE) = NDPAR 400 CONTINUE C DO 500 NUNODE = 1, NUM PARENT(NUNODE) = BROTHR(NUNODE) 500 CONTINUE C C ---------------------------------------------- C PERMUTE COLCNT(*) TO REFLECT THE NEW ORDERING. C ---------------------------------------------- DO 600 NODE = 1, NUM NUNODE = INVPOS(NODE) STACK(NUNODE) = COLCNT(NODE) 600 CONTINUE C DO 700 NODE = 1, NUM COLCNT(NODE) = STACK(NODE) 700 CONTINUE C RETURN END C*********************************************************************** C*********************************************************************** C C Version: 0.4 C Last modified: December 27, 1994 C Authors: Joseph W.H. Liu C C Mathematical Sciences Section, Oak Ridge National Laboratory C C*********************************************************************** C*********************************************************************** C********** ETORDR ..... ELIMINATION TREE REORDERING *********** C*********************************************************************** C*********************************************************************** C C WRITTEN BY JOSEPH LIU (JUL 17, 1985) C C PURPOSE: C TO DETERMINE AN EQUIVALENT REORDERING BASED ON THE STRUCTURE OF C THE ELIMINATION TREE. A POSTORDERING OF THE GIVEN ELIMINATION C TREE IS RETURNED. C C INPUT PARAMETERS: C NEQNS - NUMBER OF EQUATIONS. C (XADJ,ADJNCY) - THE ADJACENCY STRUCTURE. C C UPDATED PARAMETERS: C (PERM,INVP) - ON INPUT, THE GIVEN PERM AND INVERSE PERM C VECTORS. ON OUTPUT, THE NEW PERM AND C INVERSE PERM VECTORS OF THE EQUIVALENT C ORDERING. C C OUTPUT PARAMETERS: C PARENT - THE PARENT VECTOR OF THE ELIMINATION TREE C ASSOCIATED WITH THE NEW ORDERING. C C WORKING PARAMETERS: C FSON - THE FIRST SON VECTOR. C BROTHR - THE BROTHER VECTOR. C INVPOS - THE INVERSE PERM VECTOR FOR THE C POSTORDERING. 
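C
C       (Editorial note, not part of the original documentation: as a
C        small example, a tridiagonal matrix on 4 unknowns with the
C        identity permutation has the chain 1-2-3-4 as elimination
C        tree, so PARENT = (2,3,4,0) and the postordering leaves the
C        given ordering unchanged.)
C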
C C PROGRAM SUBROUTINES: C BETREE, ETPOST, ETREE , INVINV. C C*********************************************************************** C SUBROUTINE ETORDR ( NEQNS , XADJ , ADJNCY, PERM , INVP , & PARENT, FSON , BROTHR, INVPOS ) C C*********************************************************************** C INTEGER(4) ADJNCY(*) , BROTHR(*) , & FSON(*) , INVP(*) , & INVPOS(*) , PARENT(*) , & PERM(*) C INTEGER(4) XADJ(*) INTEGER(4) NEQNS C C*********************************************************************** C C ----------------------------- C COMPUTE THE ELIMINATION TREE. C ----------------------------- CALL ETREE ( NEQNS, XADJ, ADJNCY, PERM, INVP, PARENT, INVPOS ) C C -------------------------------------------------------- C COMPUTE A BINARY REPRESENTATION OF THE ELIMINATION TREE. C -------------------------------------------------------- CALL BETREE ( NEQNS, PARENT, FSON, BROTHR ) C C ------------------------------- C POSTORDER THE ELIMINATION TREE. C ------------------------------- CALL ETPOST ( NEQNS, FSON, BROTHR, INVPOS, PARENT, PERM ) C C -------------------------------------------------------- C COMPOSE THE ORIGINAL ORDERING WITH THE NEW POSTORDERING. C -------------------------------------------------------- CALL INVINV ( NEQNS, INVP, INVPOS, PERM ) C RETURN END C*********************************************************************** C*********************************************************************** C C Version: 0.4 C Last modified: December 27, 1994 C Authors: Joseph W.H. Liu C C Mathematical Sciences Section, Oak Ridge National Laboratory C C*********************************************************************** C*********************************************************************** C*************** ETPOST ..... ETREE POSTORDERING *************** C*********************************************************************** C*********************************************************************** C C WRITTEN BY JOSEPH LIU (SEPT 17, 1986) C C PURPOSE: C BASED ON THE BINARY REPRESENTATION (FIRST-SON,BROTHER) OF C THE ELIMINATION TREE, A POSTORDERING IS DETERMINED. THE C CORRESPONDING PARENT VECTOR IS ALSO MODIFIED TO REFLECT C THE REORDERING. C C INPUT PARAMETERS: C ROOT - ROOT OF THE ELIMINATION TREE (USUALLY IT C IS NEQNS). C FSON - THE FIRST SON VECTOR. C BROTHR - THE BROTHR VECTOR. C C UPDATED PARAMETERS: C PARENT - THE PARENT VECTOR. C C OUTPUT PARAMETERS: C INVPOS - INVERSE PERMUTATION FOR THE POSTORDERING. C C WORKING PARAMETERS: C STACK - THE STACK FOR POSTORDER TRAVERSAL OF THE C TREE. C C*********************************************************************** C SUBROUTINE ETPOST ( ROOT , FSON , BROTHR, INVPOS, PARENT, & STACK ) C C*********************************************************************** C INTEGER(4) BROTHR(*) , FSON(*) , & INVPOS(*) , PARENT(*) , & STACK(*) C INTEGER(4) ROOT C C*********************************************************************** C INTEGER(4) ITOP , NDPAR , NODE , NUM , NUNODE C C*********************************************************************** C NUM = 0 ITOP = 0 NODE = ROOT C ------------------------------------------------------------- C TRAVERSE ALONG THE FIRST SONS POINTER AND PUSH THE TREE NODES C ALONG THE TRAVERSAL INTO THE STACK. C ------------------------------------------------------------- 100 CONTINUE ITOP = ITOP + 1 STACK(ITOP) = NODE NODE = FSON(NODE) IF ( NODE .GT. 0 ) GO TO 100 C ---------------------------------------------------------- C IF POSSIBLE, POP A TREE NODE FROM THE STACK AND NUMBER IT. 
C ---------------------------------------------------------- 200 CONTINUE IF ( ITOP .LE. 0 ) GO TO 300 NODE = STACK(ITOP) ITOP = ITOP - 1 NUM = NUM + 1 INVPOS(NODE) = NUM C ---------------------------------------------------- C THEN, TRAVERSE TO ITS YOUNGER BROTHER IF IT HAS ONE. C ---------------------------------------------------- NODE = BROTHR(NODE) IF ( NODE .LE. 0 ) GO TO 200 GO TO 100 C 300 CONTINUE C ------------------------------------------------------------ C DETERMINE THE NEW PARENT VECTOR OF THE POSTORDERING. BROTHR C IS USED TEMPORARILY FOR THE NEW PARENT VECTOR. C ------------------------------------------------------------ DO 400 NODE = 1, NUM NUNODE = INVPOS(NODE) NDPAR = PARENT(NODE) IF ( NDPAR .GT. 0 ) NDPAR = INVPOS(NDPAR) BROTHR(NUNODE) = NDPAR 400 CONTINUE C DO 500 NUNODE = 1, NUM PARENT(NUNODE) = BROTHR(NUNODE) 500 CONTINUE C RETURN END C*********************************************************************** C*********************************************************************** C C Version: 0.4 C Last modified: December 27, 1994 C Authors: Joseph W.H. Liu C C Mathematical Sciences Section, Oak Ridge National Laboratory C C*********************************************************************** C*********************************************************************** C**************** ETREE ..... ELIMINATION TREE ***************** C*********************************************************************** C*********************************************************************** C C WRITTEN BY JOSEPH LIU (JUL 17, 1985) C C PURPOSE: C TO DETERMINE THE ELIMINATION TREE FROM A GIVEN ORDERING AND C THE ADJACENCY STRUCTURE. THE PARENT VECTOR IS RETURNED. C C INPUT PARAMETERS: C NEQNS - NUMBER OF EQUATIONS. C (XADJ,ADJNCY) - THE ADJACENCY STRUCTURE. C (PERM,INVP) - PERMUTATION AND INVERSE PERMUTATION VECTORS C C OUTPUT PARAMETERS: C PARENT - THE PARENT VECTOR OF THE ELIMINATION TREE. C C WORKING PARAMETERS: C ANCSTR - THE ANCESTOR VECTOR. C C*********************************************************************** C SUBROUTINE ETREE ( NEQNS , XADJ , ADJNCY, PERM , INVP , & PARENT, ANCSTR ) C C*********************************************************************** C INTEGER(4) ADJNCY(*) , ANCSTR(*) , & INVP(*) , PARENT(*) , & PERM(*) C INTEGER(4) NEQNS INTEGER(4) XADJ(*) C C*********************************************************************** C INTEGER(4) I , J , JSTOP , JSTRT , NBR , & NEXT , NODE C C*********************************************************************** C IF ( NEQNS .LE. 0 ) RETURN C DO 400 I = 1, NEQNS PARENT(I) = 0 ANCSTR(I) = 0 NODE = PERM(I) C JSTRT = XADJ(NODE) JSTOP = XADJ(NODE+1) - 1 IF ( JSTRT .LE. JSTOP ) THEN DO 300 J = JSTRT, JSTOP NBR = ADJNCY(J) NBR = INVP(NBR) IF ( NBR .LT. I ) THEN C ------------------------------------------- C FOR EACH NBR, FIND THE ROOT OF ITS CURRENT C ELIMINATION TREE. PERFORM PATH COMPRESSION C AS THE SUBTREE IS TRAVERSED. C ------------------------------------------- 100 CONTINUE IF ( ANCSTR(NBR) .EQ. I ) GO TO 300 IF ( ANCSTR(NBR) .GT. 0 ) THEN NEXT = ANCSTR(NBR) ANCSTR(NBR) = I NBR = NEXT GO TO 100 ENDIF C -------------------------------------------- C NOW, NBR IS THE ROOT OF THE SUBTREE. MAKE I C THE PARENT NODE OF THIS ROOT. 
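C                       (Editorial note, not part of the original
C                        code: ANCSTR(*) acts as a disjoint-set
C                        (union-find) structure over the rows handled
C                        so far; the loop above walks from NBR towards
C                        the root of its current subtree and resets
C                        ANCSTR along the way, so that later searches
C                        stay short.)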
C -------------------------------------------- PARENT(NBR) = I ANCSTR(NBR) = I ENDIF 300 CONTINUE ENDIF 400 CONTINUE C RETURN END C*********************************************************************** C*********************************************************************** C C Version: 0.4 C Last modified: January 12, 1995 C Authors: Esmond G. Ng and Barry W. Peyton C C Mathematical Sciences Section, Oak Ridge National Laboratory C C*********************************************************************** C*********************************************************************** C************** FCNTHN ..... FIND NONZERO COUNTS *************** C*********************************************************************** C*********************************************************************** C C PURPOSE: C THIS SUBROUTINE DETERMINES THE ROW COUNTS AND COLUMN COUNTS IN C THE CHOLESKY FACTOR. IT USES A DISJOINT SET UNION ALGORITHM. C C TECHNIQUES: C 1) SUPERNODE DETECTION. C 2) PATH HALVING. C 3) NO UNION BY RANK. C C NOTES: C 1) ASSUMES A POSTORDERING OF THE ELIMINATION TREE. C C INPUT PARAMETERS: C (I) NEQNS - NUMBER OF EQUATIONS. C (I) ADJLEN - LENGTH OF ADJACENCY STRUCTURE. C (I) XADJ(*) - ARRAY OF LENGTH NEQNS+1, CONTAINING POINTERS C TO THE ADJACENCY STRUCTURE. C (I) ADJNCY(*) - ARRAY OF LENGTH XADJ(NEQNS+1)-1, CONTAINING C THE ADJACENCY STRUCTURE. C (I) PERM(*) - ARRAY OF LENGTH NEQNS, CONTAINING THE C POSTORDERING. C (I) INVP(*) - ARRAY OF LENGTH NEQNS, CONTAINING THE C INVERSE OF THE POSTORDERING. C (I) ETPAR(*) - ARRAY OF LENGTH NEQNS, CONTAINING THE C ELIMINATION TREE OF THE POSTORDERED MATRIX. C C OUTPUT PARAMETERS: C (I) ROWCNT(*) - ARRAY OF LENGTH NEQNS, CONTAINING THE NUMBER C OF NONZEROS IN EACH ROW OF THE FACTOR, C INCLUDING THE DIAGONAL ENTRY. C (I) COLCNT(*) - ARRAY OF LENGTH NEQNS, CONTAINING THE NUMBER C OF NONZEROS IN EACH COLUMN OF THE FACTOR, C INCLUDING THE DIAGONAL ENTRY. C (I) NLNZ - NUMBER OF NONZEROS IN THE FACTOR, INCLUDING C THE DIAGONAL ENTRIES. C C WORK PARAMETERS: C (I) SET(*) - ARRAY OF LENGTH NEQNS USED TO MAINTAIN THE C DISJOINT SETS (I.E., SUBTREES). C (I) PRVLF(*) - ARRAY OF LENGTH NEQNS USED TO RECORD THE C PREVIOUS LEAF OF EACH ROW SUBTREE. C (I) LEVEL(*) - ARRAY OF LENGTH NEQNS+1 CONTAINING THE LEVEL C (DISTANCE FROM THE ROOT). C (I) WEIGHT(*) - ARRAY OF LENGTH NEQNS+1 CONTAINING WEIGHTS C USED TO COMPUTE COLUMN COUNTS. C (I) FDESC(*) - ARRAY OF LENGTH NEQNS+1 CONTAINING THE C FIRST (I.E., LOWEST-NUMBERED) DESCENDANT. C (I) NCHILD(*) - ARRAY OF LENGTH NEQNS+1 CONTAINING THE C NUMBER OF CHILDREN. C (I) PRVNBR(*) - ARRAY OF LENGTH NEQNS USED TO RECORD THE C PREVIOUS ``LOWER NEIGHBOR'' OF EACH NODE. C C FIRST CREATED ON APRIL 12, 1990. C LAST UPDATED ON JANUARY 12, 1995. C C*********************************************************************** C SUBROUTINE FCNTHN ( NEQNS , ADJLEN, XADJ , ADJNCY, PERM , & INVP , ETPAR , ROWCNT, COLCNT, NLNZ , & SET , PRVLF , LEVEL , WEIGHT, FDESC , & NCHILD, PRVNBR ) C C ----------- C PARAMETERS. C ----------- INTEGER ADJLEN, NEQNS , NLNZ INTEGER ADJNCY(ADJLEN) , COLCNT(NEQNS) , & ETPAR(NEQNS) , FDESC(0:NEQNS), & INVP(NEQNS) , LEVEL(0:NEQNS), & NCHILD(0:NEQNS) , PERM(NEQNS) , & PRVLF(NEQNS) , PRVNBR(NEQNS) , & ROWCNT(NEQNS) , SET(NEQNS) , & WEIGHT(0:NEQNS) INTEGER XADJ(*) C C ---------------- C LOCAL VARIABLES. 
C ---------------- INTEGER HINBR , IFDESC, J , JSTOP , JSTRT , & K , LAST1 , LAST2 , LCA , LFLAG , & LOWNBR, OLDNBR, PARENT, PLEAF , TEMP , & XSUP C C*********************************************************************** C C -------------------------------------------------- C COMPUTE LEVEL(*), FDESC(*), NCHILD(*). C INITIALIZE XSUP, ROWCNT(*), COLCNT(*), C SET(*), PRVLF(*), WEIGHT(*), PRVNBR(*). C -------------------------------------------------- XSUP = 1 LEVEL(0) = 0 DO 100 K = NEQNS, 1, -1 ROWCNT(K) = 1 COLCNT(K) = 0 SET(K) = K PRVLF(K) = 0 LEVEL(K) = LEVEL(ETPAR(K)) + 1 WEIGHT(K) = 1 FDESC(K) = K NCHILD(K) = 0 PRVNBR(K) = 0 100 CONTINUE NCHILD(0) = 0 FDESC(0) = 0 DO 200 K = 1, NEQNS PARENT = ETPAR(K) WEIGHT(PARENT) = 0 NCHILD(PARENT) = NCHILD(PARENT) + 1 IFDESC = FDESC(K) IF ( IFDESC .LT. FDESC(PARENT) ) THEN FDESC(PARENT) = IFDESC ENDIF 200 CONTINUE C ------------------------------------ C FOR EACH ``LOW NEIGHBOR'' LOWNBR ... C ------------------------------------ DO 600 LOWNBR = 1, NEQNS LFLAG = 0 IFDESC = FDESC(LOWNBR) OLDNBR = PERM(LOWNBR) JSTRT = XADJ(OLDNBR) JSTOP = XADJ(OLDNBR+1) - 1 C ----------------------------------------------- C FOR EACH ``HIGH NEIGHBOR'', HINBR OF LOWNBR ... C ----------------------------------------------- DO 500 J = JSTRT, JSTOP HINBR = INVP(ADJNCY(J)) IF ( HINBR .GT. LOWNBR ) THEN IF ( IFDESC .GT. PRVNBR(HINBR) ) THEN C ------------------------- C INCREMENT WEIGHT(LOWNBR). C ------------------------- WEIGHT(LOWNBR) = WEIGHT(LOWNBR) + 1 PLEAF = PRVLF(HINBR) C ----------------------------------------- C IF HINBR HAS NO PREVIOUS ``LOW NEIGHBOR'' C THEN ... C ----------------------------------------- IF ( PLEAF .EQ. 0 ) THEN C ----------------------------------------- C ... ACCUMULATE LOWNBR-->HINBR PATH LENGTH C IN ROWCNT(HINBR). C ----------------------------------------- ROWCNT(HINBR) = ROWCNT(HINBR) + & LEVEL(LOWNBR) - LEVEL(HINBR) ELSE C ----------------------------------------- C ... OTHERWISE, LCA <-- FIND(PLEAF), WHICH C IS THE LEAST COMMON ANCESTOR OF PLEAF C AND LOWNBR. C (PATH HALVING.) C ----------------------------------------- LAST1 = PLEAF LAST2 = SET(LAST1) LCA = SET(LAST2) 300 CONTINUE IF ( LCA .NE. LAST2 ) THEN SET(LAST1) = LCA LAST1 = LCA LAST2 = SET(LAST1) LCA = SET(LAST2) GO TO 300 ENDIF C ------------------------------------- C ACCUMULATE PLEAF-->LCA PATH LENGTH IN C ROWCNT(HINBR). C DECREMENT WEIGHT(LCA). C ------------------------------------- ROWCNT(HINBR) = ROWCNT(HINBR) & + LEVEL(LOWNBR) - LEVEL(LCA) WEIGHT(LCA) = WEIGHT(LCA) - 1 ENDIF C ---------------------------------------------- C LOWNBR NOW BECOMES ``PREVIOUS LEAF'' OF HINBR. C ---------------------------------------------- PRVLF(HINBR) = LOWNBR LFLAG = 1 ENDIF C -------------------------------------------------- C LOWNBR NOW BECOMES ``PREVIOUS NEIGHBOR'' OF HINBR. C -------------------------------------------------- PRVNBR(HINBR) = LOWNBR ENDIF 500 CONTINUE C ---------------------------------------------------- C DECREMENT WEIGHT ( PARENT(LOWNBR) ). C SET ( P(LOWNBR) ) <-- SET ( P(LOWNBR) ) + SET(XSUP). C ---------------------------------------------------- PARENT = ETPAR(LOWNBR) WEIGHT(PARENT) = WEIGHT(PARENT) - 1 IF ( LFLAG .EQ. 1 .OR. & NCHILD(LOWNBR) .GE. 2 ) THEN XSUP = LOWNBR ENDIF SET(XSUP) = PARENT 600 CONTINUE C --------------------------------------------------------- C USE WEIGHTS TO COMPUTE COLUMN (AND TOTAL) NONZERO COUNTS. 
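C       (Editorial note, not part of the original code: the loop below
C        accumulates the weights bottom-up over the elimination tree;
C        because the ordering is a postordering, every child K is
C        processed before its parent, so COLCNT(K) is complete when it
C        is added to COLCNT(PARENT).)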
C --------------------------------------------------------- NLNZ = 0 DO 700 K = 1, NEQNS TEMP = COLCNT(K) + WEIGHT(K) COLCNT(K) = TEMP NLNZ = NLNZ + TEMP PARENT = ETPAR(K) IF ( PARENT .NE. 0 ) THEN COLCNT(PARENT) = COLCNT(PARENT) + TEMP ENDIF 700 CONTINUE C RETURN END C*********************************************************************** C*********************************************************************** C C Version: 0.4 C Last modified: May 26, 1995 C Authors: Esmond G. Ng and Barry W. Peyton C C Mathematical Sciences Section, Oak Ridge National Laboratory C C*********************************************************************** C*********************************************************************** C**** FNSPLT ..... COMPUTE FINE PARTITIONING OF SUPERNODES ***** C*********************************************************************** C*********************************************************************** C C PURPOSE: C THIS SUBROUTINE DETERMINES A FINE PARTITIONING OF SUPERNODES C WHEN THERE IS A CACHE AVAILABLE ON THE MACHINE. THE FINE C PARTITIONING IS CHOSEN SO THAT DATA RE-USE IS MAXIMIZED. C C INPUT PARAMETERS: C NEQNS - NUMBER OF EQUATIONS. C NSUPER - NUMBER OF SUPERNODES. C XSUPER - INTEGER ARRAY OF SIZE (NSUPER+1) CONTAINING C THE SUPERNODE PARTITIONING. C XLINDX - INTEGER ARRAY OF SIZE (NSUPER+1) CONTAINING C POINTERS IN THE SUPERNODE INDICES. C CACHSZ - CACHE SIZE IN KILO BYTES. C IF THERE IS NO CACHE, SET CACHSZ = 0. C C OUTPUT PARAMETERS: C SPLIT - INTEGER ARRAY OF SIZE NEQNS CONTAINING THE C FINE PARTITIONING. C C*********************************************************************** C SUBROUTINE FNSPLT ( NEQNS , NSUPER, XSUPER, XLINDX, & CACHSZ, SPLIT ) C C*********************************************************************** C C ----------- C PARAMETERS. C ----------- INTEGER CACHSZ, NEQNS , NSUPER INTEGER XSUPER(*), SPLIT(*) INTEGER XLINDX(*) C C ---------------- C LOCAL VARIABLES. C ---------------- INTEGER CACHE , CURCOL, FSTCOL, HEIGHT, KCOL , 1 KSUP , LSTCOL, NCOLS , NXTBLK, USED , 1 WIDTH C C ******************************************************************* C C -------------------------------------------- C COMPUTE THE NUMBER OF 8-BYTE WORDS IN CACHE. C -------------------------------------------- IF ( CACHSZ .LE. 0 ) THEN CACHE = 2 000 000 000 ELSE CACHE = INT(( FLOAT(CACHSZ) * 1024. / 8. ) * 0.9) ENDIF C C --------------- C INITIALIZATION. C --------------- DO 100 KCOL = 1, NEQNS SPLIT(KCOL) = 0 100 CONTINUE C C --------------------------- C FOR EACH SUPERNODE KSUP ... C --------------------------- DO 1000 KSUP = 1, NSUPER C ----------------------- C ... GET SUPERNODE INFO. C ----------------------- HEIGHT = XLINDX(KSUP+1) - XLINDX(KSUP) FSTCOL = XSUPER(KSUP) LSTCOL = XSUPER(KSUP+1) - 1 WIDTH = LSTCOL - FSTCOL + 1 NXTBLK = FSTCOL C -------------------------------------- C ... UNTIL ALL COLUMNS OF THE SUPERNODE C HAVE BEEN PROCESSED ... C -------------------------------------- CURCOL = FSTCOL - 1 200 CONTINUE C ------------------------------------------- C ... PLACE THE FIRST COLUMN(S) IN THE CACHE. C ------------------------------------------- CURCOL = CURCOL + 1 IF ( CURCOL .LT. LSTCOL ) THEN CURCOL = CURCOL + 1 NCOLS = 2 USED = 4 * HEIGHT - 1 HEIGHT = HEIGHT - 2 ELSE NCOLS = 1 USED = 3 * HEIGHT HEIGHT = HEIGHT - 1 ENDIF C C -------------------------------------- C ... WHILE THE CACHE IS NOT FILLED AND C THERE ARE COLUMNS OF THE SUPERNODE C REMAINING TO BE PROCESSED ... C -------------------------------------- 300 CONTINUE IF ( USED+HEIGHT .LT. 
CACHE .AND. & CURCOL .LT. LSTCOL ) THEN C -------------------------------- C ... ADD ANOTHER COLUMN TO CACHE. C -------------------------------- CURCOL = CURCOL + 1 NCOLS = NCOLS + 1 USED = USED + HEIGHT HEIGHT = HEIGHT - 1 GO TO 300 ENDIF C ------------------------------------- C ... RECORD THE NUMBER OF COLUMNS THAT C FILLED THE CACHE. C ------------------------------------- SPLIT(NXTBLK) = NCOLS NXTBLK = NXTBLK + 1 C -------------------------- C ... GO PROCESS NEXT BLOCK. C -------------------------- IF ( CURCOL .LT. LSTCOL ) GO TO 200 1000 CONTINUE C RETURN END C*********************************************************************** C*********************************************************************** C C Version: 0.4 C Last modified: December 27, 1994 C Authors: Esmond G. Ng and Barry W. Peyton C C Mathematical Sciences Section, Oak Ridge National Laboratory C C*********************************************************************** C*********************************************************************** C****** FNTSIZ ..... COMPUTE WORK STORAGE SIZE FOR BLKFCT ****** C*********************************************************************** C*********************************************************************** C C PURPOSE: C THIS SUBROUTINE DETERMINES THE SIZE OF THE WORKING STORAGE C REQUIRED BY BLKFCT. C C INPUT PARAMETERS: C NSUPER - NUMBER OF SUPERNODES. C XSUPER - INTEGER ARRAY OF SIZE (NSUPER+1) CONTAINING C THE SUPERNODE PARTITIONING. C SNODE - SUPERNODE MEMBERSHIP. C (XLINDX,LINDX) - ARRAYS DESCRIBING THE SUPERNODAL STRUCTURE. C C OUTPUT PARAMETERS: C TMPSIZ - SIZE OF WORKING STORAGE REQUIRED BY BLKFCT. C C*********************************************************************** C SUBROUTINE FNTSIZ ( NSUPER, XSUPER, SNODE , XLINDX, & LINDX , TMPSIZ ) C C*********************************************************************** C INTEGER NSUPER, TMPSIZ INTEGER XLINDX(*) , XSUPER(*) INTEGER LINDX (*) , SNODE (*) C INTEGER BOUND , CLEN , CURSUP, I , IBEGIN, IEND , & KSUP , LENGTH, NCOLS , NXTSUP, & TSIZE , WIDTH C C*********************************************************************** C C RETURNS SIZE OF TEMP ARRAY USED BY BLKFCT FACTORIZATION ROUTINE. C NOTE THAT THE VALUE RETURNED IS AN ESTIMATE, THOUGH IT IS USUALLY C TIGHT. C C ---------------------------------------- C COMPUTE SIZE OF TEMPORARY STORAGE VECTOR C NEEDED BY BLKFCT. C ---------------------------------------- TMPSIZ = 0 DO 500 KSUP = NSUPER, 1, -1 NCOLS = XSUPER(KSUP+1) - XSUPER(KSUP) IBEGIN = XLINDX(KSUP) + NCOLS IEND = XLINDX(KSUP+1) - 1 LENGTH = IEND - IBEGIN + 1 BOUND = LENGTH * (LENGTH + 1) / 2 IF ( BOUND .GT. TMPSIZ ) THEN CURSUP = SNODE(LINDX(IBEGIN)) CLEN = XLINDX(CURSUP+1) - XLINDX(CURSUP) WIDTH = 0 DO 400 I = IBEGIN, IEND NXTSUP = SNODE(LINDX(I)) IF ( NXTSUP .EQ. CURSUP ) THEN WIDTH = WIDTH + 1 IF ( I .EQ. IEND ) THEN IF ( CLEN .GT. LENGTH ) THEN TSIZE = LENGTH * WIDTH - & (WIDTH - 1) * WIDTH / 2 TMPSIZ = MAX ( TSIZE , TMPSIZ ) ENDIF ENDIF ELSE IF ( CLEN .GT. LENGTH ) THEN TSIZE = LENGTH * WIDTH - & (WIDTH - 1) * WIDTH / 2 TMPSIZ = MAX ( TSIZE , TMPSIZ ) ENDIF LENGTH = LENGTH - WIDTH BOUND = LENGTH * (LENGTH + 1) / 2 IF ( BOUND .LE. TMPSIZ ) GO TO 500 WIDTH = 1 CURSUP = NXTSUP CLEN = XLINDX(CURSUP+1) - XLINDX(CURSUP) ENDIF 400 CONTINUE ENDIF 500 CONTINUE C RETURN END C*********************************************************************** C*********************************************************************** C C Version: 0.4 C Last modified: December 27, 1994 C Authors: Esmond G. Ng and Barry W. 
Peyton C C Mathematical Sciences Section, Oak Ridge National Laboratory C C*********************************************************************** C*********************************************************************** C**************** FSUP1 ..... FIND SUPERNODES #1 ***************** C*********************************************************************** C*********************************************************************** C C PURPOSE: C THIS SUBROUTINE IS THE FIRST OF TWO ROUTINES FOR FINDING A C MAXIMAL SUPERNODE PARTITION. IT RETURNS ONLY THE NUMBER OF C SUPERNODES NSUPER AND THE SUPERNODE MEMBERSHIP VECTOR SNODE(*), C WHICH IS OF LENGTH NEQNS. THE VECTORS OF LENGTH NSUPER ARE C COMPUTED SUBSEQUENTLY BY THE COMPANION ROUTINE FSUP2. C C METHOD AND ASSUMPTIONS: C THIS ROUTINE USES THE ELIMINATION TREE AND THE FACTOR COLUMN C COUNTS TO COMPUTE THE SUPERNODE PARTITION; IT ALSO ASSUMES A C POSTORDERING OF THE ELIMINATION TREE. C C INPUT PARAMETERS: C (I) NEQNS - NUMBER OF EQUATIONS. C (I) ETPAR(*) - ARRAY OF LENGTH NEQNS, CONTAINING THE C ELIMINATION TREE OF THE POSTORDERED MATRIX. C (I) COLCNT(*) - ARRAY OF LENGTH NEQNS, CONTAINING THE C FACTOR COLUMN COUNTS: I.E., THE NUMBER OF C NONZERO ENTRIES IN EACH COLUMN OF L C (INCLUDING THE DIAGONAL ENTRY). C C OUTPUT PARAMETERS: C (I) NOFSUB - NUMBER OF SUBSCRIPTS. C (I) NSUPER - NUMBER OF SUPERNODES (<= NEQNS). C (I) SNODE(*) - ARRAY OF LENGTH NEQNS FOR RECORDING C SUPERNODE MEMBERSHIP. C C FIRST CREATED ON JANUARY 18, 1992. C LAST UPDATED ON NOVEMBER 11, 1994. C C*********************************************************************** C SUBROUTINE FSUP1 ( NEQNS , ETPAR , COLCNT, NOFSUB, NSUPER, & SNODE ) C C*********************************************************************** C C ----------- C PARAMETERS. C ----------- INTEGER NEQNS , NOFSUB, NSUPER INTEGER COLCNT(*) , ETPAR(*) , & SNODE(*) C C ---------------- C LOCAL VARIABLES. C ---------------- INTEGER KCOL C C*********************************************************************** C C -------------------------------------------- C COMPUTE THE FUNDAMENTAL SUPERNODE PARTITION. C -------------------------------------------- NSUPER = 1 SNODE(1) = 1 NOFSUB = COLCNT(1) DO 300 KCOL = 2, NEQNS IF ( ETPAR(KCOL-1) .EQ. KCOL ) THEN IF ( COLCNT(KCOL-1) .EQ. COLCNT(KCOL)+1 ) THEN SNODE(KCOL) = NSUPER GO TO 300 ENDIF ENDIF NSUPER = NSUPER + 1 SNODE(KCOL) = NSUPER NOFSUB = NOFSUB + COLCNT(KCOL) 300 CONTINUE C RETURN END C*********************************************************************** C*********************************************************************** C C Version: 0.4 C Last modified: December 27, 1994 C Authors: Esmond G. Ng and Barry W. Peyton C C Mathematical Sciences Section, Oak Ridge National Laboratory C C*********************************************************************** C*********************************************************************** C**************** FSUP2 ..... FIND SUPERNODES #2 ***************** C*********************************************************************** C*********************************************************************** C C PURPOSE: C THIS SUBROUTINE IS THE SECOND OF TWO ROUTINES FOR FINDING A C MAXIMAL SUPERNODE PARTITION. IT'S SOLE PURPOSE IS TO C CONSTRUCT THE NEEDED VECTOR OF LENGTH NSUPER: XSUPER(*). THE C FIRST ROUTINE FSUP1 COMPUTES THE NUMBER OF SUPERNODES AND THE C SUPERNODE MEMBERSHIP VECTOR SNODE(*), WHICH IS OF LENGTH NEQNS. C C C ASSUMPTIONS: C THIS ROUTINE ASSUMES A POSTORDERING OF THE ELIMINATION TREE. 
IT C ALSO ASSUMES THAT THE OUTPUT FROM FSUP1 IS AVAILABLE. C C INPUT PARAMETERS: C (I) NEQNS - NUMBER OF EQUATIONS. C (I) NSUPER - NUMBER OF SUPERNODES (<= NEQNS). C (I) SNODE(*) - ARRAY OF LENGTH NEQNS FOR RECORDING C SUPERNODE MEMBERSHIP. C C OUTPUT PARAMETERS: C (I) XSUPER(*) - ARRAY OF LENGTH NSUPER+1, CONTAINING THE C SUPERNODE PARTITIONING. C C FIRST CREATED ON JANUARY 18, 1992. C LAST UPDATED ON NOVEMEBER 22, 1994. C C*********************************************************************** C SUBROUTINE FSUP2 ( NEQNS , NSUPER, SNODE , XSUPER ) C C*********************************************************************** C C ----------- C PARAMETERS. C ----------- INTEGER NEQNS , NSUPER INTEGER SNODE(*) , & XSUPER(*) C C ---------------- C LOCAL VARIABLES. C ---------------- INTEGER KCOL , KSUP , LSTSUP C C*********************************************************************** C C ------------------------------------------------- C COMPUTE THE SUPERNODE PARTITION VECTOR XSUPER(*). C ------------------------------------------------- LSTSUP = NSUPER + 1 DO 100 KCOL = NEQNS, 1, -1 KSUP = SNODE(KCOL) IF ( KSUP .NE. LSTSUP ) THEN XSUPER(LSTSUP) = KCOL + 1 ENDIF LSTSUP = KSUP 100 CONTINUE XSUPER(1) = 1 C RETURN END C*********************************************************************** C*********************************************************************** C C Version: 0.4 C Last modified: December 27, 1994 C Authors: Joseph W.H. Liu C C Mathematical Sciences Section, Oak Ridge National Laboratory C C*********************************************************************** C*********************************************************************** C--- SPARSPAK-A (ANSI FORTRAN) RELEASE III --- NAME = GENMMD C (C) UNIVERSITY OF WATERLOO JANUARY 1984 C*********************************************************************** C*********************************************************************** C**** GENMMD ..... MULTIPLE MINIMUM EXTERNAL DEGREE ************ C*********************************************************************** C*********************************************************************** C C PURPOSE - THIS ROUTINE IMPLEMENTS THE MINIMUM DEGREE C ALGORITHM. IT MAKES USE OF THE IMPLICIT REPRESENTATION C OF ELIMINATION GRAPHS BY QUOTIENT GRAPHS, AND THE C NOTION OF INDISTINGUISHABLE NODES. IT ALSO IMPLEMENTS C THE MODIFICATIONS BY MULTIPLE ELIMINATION AND MINIMUM C EXTERNAL DEGREE. C --------------------------------------------- C CAUTION - THE ADJACENCY VECTOR ADJNCY WILL BE C DESTROYED. C --------------------------------------------- C C INPUT PARAMETERS - C NEQNS - NUMBER OF EQUATIONS. C (XADJ,ADJNCY) - THE ADJACENCY STRUCTURE. C DELTA - TOLERANCE VALUE FOR MULTIPLE ELIMINATION. C MAXINT - MAXIMUM MACHINE REPRESENTABLE (SHORT) INTEGER C (ANY SMALLER ESTIMATE WILL DO) FOR MARKING C NODES. C C OUTPUT PARAMETERS - C PERM - THE MINIMUM DEGREE ORDERING. C INVP - THE INVERSE OF PERM. C NOFSUB - AN UPPER BOUND ON THE NUMBER OF NONZERO C SUBSCRIPTS FOR THE COMPRESSED STORAGE SCHEME. C C WORKING PARAMETERS - C DHEAD - VECTOR FOR HEAD OF DEGREE LISTS. C INVP - USED TEMPORARILY FOR DEGREE FORWARD LINK. C PERM - USED TEMPORARILY FOR DEGREE BACKWARD LINK. C QSIZE - VECTOR FOR SIZE OF SUPERNODES. C LLIST - VECTOR FOR TEMPORARY LINKED LISTS. C MARKER - A TEMPORARY MARKER VECTOR. C C PROGRAM SUBROUTINES - C MMDELM, MMDINT, MMDNUM, MMDUPD. 
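C
C EDITOR'S NOTE (ADDED COMMENT; NOT PART OF THE ORIGINAL LIU CODE) -
C     A MINIMAL SKETCH OF HOW THIS ORDERING IS DRIVEN ELSEWHERE IN
C     THIS FILE: ORDMMD (BELOW) CALLS GENMMD WITH DELTA = 0 AND
C     MAXINT = 32767, SPLITTING ITS INTEGER WORKSPACE OF LENGTH
C     4*NEQNS INTO THE FOUR VECTORS DHEAD, QSIZE, LLIST AND MARKER.
C     WITH DELTA = 0 ONLY NODES OF EXACTLY THE CURRENT MINIMUM
C     DEGREE ARE ELIMINATED BEFORE EACH DEGREE UPDATE (MDLMT =
C     MDEG); A LARGER DELTA PERMITS MORE ELIMINATIONS PER UPDATE.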
C C*********************************************************************** C SUBROUTINE GENMMD ( NEQNS, XADJ, ADJNCY, INVP, PERM, 1 DELTA, DHEAD, QSIZE, LLIST, MARKER, 1 MAXINT, NOFSUB ) C C*********************************************************************** C INTEGER ADJNCY(*), DHEAD(*) , INVP(*) , LLIST(*) , 1 MARKER(*), PERM(*) , QSIZE(*) INTEGER XADJ(*) INTEGER DELTA , EHEAD , I , MAXINT, MDEG , 1 MDLMT , MDNODE, NEQNS , NEXTMD, NOFSUB, 1 NUM, TAG C C*********************************************************************** C IF ( NEQNS .LE. 0 ) RETURN C C ------------------------------------------------ C INITIALIZATION FOR THE MINIMUM DEGREE ALGORITHM. C ------------------------------------------------ NOFSUB = 0 CALL MMDINT ( NEQNS, XADJ, DHEAD, INVP, PERM, 1 QSIZE, LLIST, MARKER ) C C ---------------------------------------------- C NUM COUNTS THE NUMBER OF ORDERED NODES PLUS 1. C ---------------------------------------------- NUM = 1 C C ----------------------------- C ELIMINATE ALL ISOLATED NODES. C ----------------------------- NEXTMD = DHEAD(1) 100 CONTINUE IF ( NEXTMD .LE. 0 ) GO TO 200 MDNODE = NEXTMD NEXTMD = INVP(MDNODE) MARKER(MDNODE) = MAXINT INVP(MDNODE) = - NUM NUM = NUM + 1 GO TO 100 C 200 CONTINUE C ---------------------------------------- C SEARCH FOR NODE OF THE MINIMUM DEGREE. C MDEG IS THE CURRENT MINIMUM DEGREE; C TAG IS USED TO FACILITATE MARKING NODES. C ---------------------------------------- IF ( NUM .GT. NEQNS ) GO TO 1000 TAG = 1 DHEAD(1) = 0 MDEG = 2 300 CONTINUE IF ( DHEAD(MDEG) .GT. 0 ) GO TO 400 MDEG = MDEG + 1 GO TO 300 400 CONTINUE C ------------------------------------------------- C USE VALUE OF DELTA TO SET UP MDLMT, WHICH GOVERNS C WHEN A DEGREE UPDATE IS TO BE PERFORMED. C ------------------------------------------------- MDLMT = MDEG + DELTA EHEAD = 0 C 500 CONTINUE MDNODE = DHEAD(MDEG) IF ( MDNODE .GT. 0 ) GO TO 600 MDEG = MDEG + 1 IF ( MDEG .GT. MDLMT ) GO TO 900 GO TO 500 600 CONTINUE C ---------------------------------------- C REMOVE MDNODE FROM THE DEGREE STRUCTURE. C ---------------------------------------- NEXTMD = INVP(MDNODE) DHEAD(MDEG) = NEXTMD IF ( NEXTMD .GT. 0 ) PERM(NEXTMD) = - MDEG INVP(MDNODE) = - NUM NOFSUB = NOFSUB + MDEG + QSIZE(MDNODE) - 2 IF ( NUM+QSIZE(MDNODE) .GT. NEQNS ) GO TO 1000 C ---------------------------------------------- C ELIMINATE MDNODE AND PERFORM QUOTIENT GRAPH C TRANSFORMATION. RESET TAG VALUE IF NECESSARY. C ---------------------------------------------- TAG = TAG + 1 IF ( TAG .LT. MAXINT ) GO TO 800 TAG = 1 DO 700 I = 1, NEQNS IF ( MARKER(I) .LT. MAXINT ) MARKER(I) = 0 700 CONTINUE 800 CONTINUE CALL MMDELM ( MDNODE, XADJ, ADJNCY, DHEAD, INVP, 1 PERM, QSIZE, LLIST, MARKER, MAXINT, 1 TAG ) NUM = NUM + QSIZE(MDNODE) LLIST(MDNODE) = EHEAD EHEAD = MDNODE IF ( DELTA .GE. 0 ) GO TO 500 900 CONTINUE C ------------------------------------------- C UPDATE DEGREES OF THE NODES INVOLVED IN THE C MINIMUM DEGREE NODES ELIMINATION. C ------------------------------------------- IF ( NUM .GT. NEQNS ) GO TO 1000 CALL MMDUPD ( EHEAD, NEQNS, XADJ, ADJNCY, DELTA, MDEG, 1 DHEAD, INVP, PERM, QSIZE, LLIST, MARKER, 1 MAXINT, TAG ) GO TO 300 C 1000 CONTINUE CALL MMDNUM ( NEQNS, PERM, INVP, QSIZE ) RETURN C END C*********************************************************************** C*********************************************************************** C C Version: 0.4 C Last modified: December 27, 1994 C Authors: Esmond G. Ng and Barry W. 
Peyton C C Mathematical Sciences Section, Oak Ridge National Laboratory C C*********************************************************************** C*********************************************************************** C****** IGATHR .... INTEGER GATHER OPERATION ************** C*********************************************************************** C*********************************************************************** C C PURPOSE - THIS ROUTINE PERFORMS A STANDARD INTEGER GATHER C OPERATION. C C INPUT PARAMETERS - C KLEN - LENGTH OF THE LIST OF GLOBAL INDICES. C LINDX - LIST OF GLOBAL INDICES. C INDMAP - INDEXED BY GLOBAL INDICES, IT CONTAINS THE C REQUIRED RELATIVE INDICES. C C OUTPUT PARAMETERS - C RELIND - LIST RELATIVE INDICES. C C*********************************************************************** C SUBROUTINE IGATHR ( KLEN , LINDX, INDMAP, RELIND ) C C*********************************************************************** C C ----------- C PARAMETERS. C ----------- INTEGER KLEN INTEGER INDMAP(*), LINDX (*), RELIND(*) C C ---------------- C LOCAL VARIABLES. C ---------------- INTEGER I C C*********************************************************************** C CDIR$ IVDEP DO 100 I = 1, KLEN RELIND(I) = INDMAP(LINDX(I)) 100 CONTINUE RETURN END C*********************************************************************** C*********************************************************************** C C Version: 0.4 C Last modified: December 27, 1994 C Authors: Esmond G. Ng and Barry W. Peyton C C Mathematical Sciences Section, Oak Ridge National Laboratory C C*********************************************************************** C*********************************************************************** C C ------------------------------------------------------ C INPUT NUMERICAL VALUES INTO SPARSE DATA STRUCTURES ... C ------------------------------------------------------ C SUBROUTINE INPNV ( XADJF, ADJF, ANZF, PERM, INVP, & NSUPER, XSUPER, XLINDX, LINDX, & XLNZ, LNZ, OFFSET ) C INTEGER XADJF(*), ADJF(*) DOUBLE PRECISION ANZF(*) INTEGER PERM(*), INVP(*) INTEGER NSUPER INTEGER XSUPER(*), XLINDX(*), LINDX(*) INTEGER XLNZ(*) DOUBLE PRECISION LNZ(*) INTEGER OFFSET(*) C INTEGER I, II, J, JLEN, JSUPER, LAST, OLDJ C DO 500 JSUPER = 1, NSUPER C C ---------------------------------------- C FOR EACH SUPERNODE, DO THE FOLLOWING ... C ---------------------------------------- C C ----------------------------------------------- C FIRST GET OFFSET TO FACILITATE NUMERICAL INPUT. C ----------------------------------------------- JLEN = XLINDX(JSUPER+1) - XLINDX(JSUPER) DO 100 II = XLINDX(JSUPER), XLINDX(JSUPER+1)-1 I = LINDX(II) JLEN = JLEN - 1 OFFSET(I) = JLEN 100 CONTINUE C DO 400 J = XSUPER(JSUPER), XSUPER(JSUPER+1)-1 C ----------------------------------------- C FOR EACH COLUMN IN THE CURRENT SUPERNODE, C FIRST INITIALIZE THE DATA STRUCTURE. C ----------------------------------------- c DO 200 II = XLNZ(J), XLNZ(J+1)-1 c LNZ(II) = 0.0 c 200 CONTINUE c The previous lines are not required as R initializes the arrays c Reinhard Furrer, Nov 19, 2007 C C ----------------------------------- C NEXT INPUT THE INDIVIDUAL NONZEROS. C ----------------------------------- OLDJ = PERM(J) LAST = XLNZ(J+1) - 1 DO 300 II = XADJF(OLDJ), XADJF(OLDJ+1)-1 I = INVP(ADJF(II)) IF ( I .GE. 
J ) THEN LNZ(LAST-OFFSET(I)) = ANZF(II) ENDIF 300 CONTINUE 400 CONTINUE C 500 CONTINUE RETURN END C*********************************************************************** C*********************************************************************** C C Version: 0.4 C Last modified: December 27, 1994 C Authors: Joseph W.H. Liu C C Mathematical Sciences Section, Oak Ridge National Laboratory C C*********************************************************************** C*********************************************************************** C*********** INVINV ..... CONCATENATION OF TWO INVP ************ C*********************************************************************** C*********************************************************************** C C WRITTEN BY JOSEPH LIU (JUL 17, 1985) C C PURPOSE: C TO PERFORM THE MAPPING OF C ORIGINAL-INVP --> INTERMEDIATE-INVP --> NEW INVP C AND THE RESULTING ORDERING REPLACES INVP. THE NEW PERMUTATION C VECTOR PERM IS ALSO COMPUTED. C C INPUT PARAMETERS: C NEQNS - NUMBER OF EQUATIONS. C INVP2 - THE SECOND INVERSE PERMUTATION VECTOR. C C UPDATED PARAMETERS: C INVP - THE FIRST INVERSE PERMUTATION VECTOR. ON C OUTPUT, IT CONTAINS THE NEW INVERSE C PERMUTATION. C C OUTPUT PARAMETER: C PERM - NEW PERMUTATION VECTOR (CAN BE THE SAME AS C INVP2). C C*********************************************************************** C SUBROUTINE INVINV ( NEQNS , INVP , INVP2 , PERM ) C C*********************************************************************** C INTEGER(4) INVP(*) , INVP2(*) , & PERM(*) C INTEGER(4) NEQNS C C*********************************************************************** C INTEGER(4) I , INTERM, NODE C C*********************************************************************** C DO 100 I = 1, NEQNS INTERM = INVP(I) INVP(I) = INVP2(INTERM) 100 CONTINUE C DO 200 I = 1, NEQNS NODE = INVP(I) PERM(NODE) = I 200 CONTINUE C RETURN END C*********************************************************************** C*********************************************************************** C C Version: 0.4 C Last modified: December 27, 1994 C Authors: Esmond G. Ng and Barry W. Peyton C C Mathematical Sciences Section, Oak Ridge National Laboratory C C*********************************************************************** C*********************************************************************** C****** LDINDX .... LOAD INDEX VECTOR ************** C*********************************************************************** C*********************************************************************** C C PURPOSE - THIS ROUTINE COMPUTES THE SECOND INDEX VECTOR C USED TO IMPLEMENT THE DOUBLY-INDIRECT SAXPY-LIKE C LOOPS THAT ALLOW US TO ACCUMULATE UPDATE C COLUMNS DIRECTLY INTO FACTOR STORAGE. C C INPUT PARAMETERS - C JLEN - LENGTH OF THE FIRST COLUMN OF THE SUPERNODE, C INCLUDING THE DIAGONAL ENTRY. C LINDX - THE OFF-DIAGONAL ROW INDICES OF THE SUPERNODE, C I.E., THE ROW INDICES OF THE NONZERO ENTRIES C LYING BELOW THE DIAGONAL ENTRY OF THE FIRST C COLUMN OF THE SUPERNODE. C C OUTPUT PARAMETERS - C INDMAP - THIS INDEX VECTOR MAPS EVERY GLOBAL ROW INDEX C OF NONZERO ENTRIES IN THE FIRST COLUMN OF THE C SUPERNODE TO ITS POSITION IN THE INDEX LIST C RELATIVE TO THE LAST INDEX IN THE LIST. MORE C PRECISELY, IT GIVES THE DISTANCE OF EACH INDEX C FROM THE LAST INDEX IN THE LIST. C C*********************************************************************** C SUBROUTINE LDINDX ( JLEN, LINDX, INDMAP ) C C*********************************************************************** C C ----------- C PARAMETERS. 
C ----------- INTEGER JLEN INTEGER LINDX(*), INDMAP(*) C C ---------------- C LOCAL VARIABLES. C ---------------- INTEGER CURLEN, J, JSUB C C*********************************************************************** C CURLEN = JLEN DO 200 J = 1, JLEN JSUB = LINDX(J) CURLEN = CURLEN - 1 INDMAP(JSUB) = CURLEN 200 CONTINUE RETURN END C*********************************************************************** C*********************************************************************** C C Version: 0.4 C Last modified: December 27, 1994 C Authors: Joseph W.H. Liu C C Mathematical Sciences Section, Oak Ridge National Laboratory C C*********************************************************************** C*********************************************************************** C--- SPARSPAK-A (ANSI FORTRAN) RELEASE III --- NAME = MMDELM C (C) UNIVERSITY OF WATERLOO JANUARY 1984 C*********************************************************************** C*********************************************************************** C** MMDELM ..... MULTIPLE MINIMUM DEGREE ELIMINATION *********** C*********************************************************************** C*********************************************************************** C C PURPOSE - THIS ROUTINE ELIMINATES THE NODE MDNODE OF C MINIMUM DEGREE FROM THE ADJACENCY STRUCTURE, WHICH C IS STORED IN THE QUOTIENT GRAPH FORMAT. IT ALSO C TRANSFORMS THE QUOTIENT GRAPH REPRESENTATION OF THE C ELIMINATION GRAPH. C C INPUT PARAMETERS - C MDNODE - NODE OF MINIMUM DEGREE. C MAXINT - ESTIMATE OF MAXIMUM REPRESENTABLE (SHORT) C INTEGER. C TAG - TAG VALUE. C C UPDATED PARAMETERS - C (XADJ,ADJNCY) - UPDATED ADJACENCY STRUCTURE. C (DHEAD,DFORW,DBAKW) - DEGREE DOUBLY LINKED STRUCTURE. C QSIZE - SIZE OF SUPERNODE. C MARKER - MARKER VECTOR. C LLIST - TEMPORARY LINKED LIST OF ELIMINATED NABORS. C C*********************************************************************** C SUBROUTINE MMDELM ( MDNODE, XADJ, ADJNCY, DHEAD, DFORW, 1 DBAKW, QSIZE, LLIST, MARKER, MAXINT, 1 TAG ) C C*********************************************************************** C INTEGER ADJNCY(*), DBAKW(*) , DFORW(*) , DHEAD(*) , 1 LLIST(*) , MARKER(*), QSIZE(*) INTEGER XADJ(*) INTEGER ELMNT , I , ISTOP , ISTRT , J , 1 JSTOP , JSTRT , LINK , MAXINT, MDNODE, 1 NABOR , NODE , NPV , NQNBRS, NXNODE, 1 PVNODE, RLMT , RLOC , RNODE , TAG , 1 XQNBR C C*********************************************************************** C C ----------------------------------------------- C FIND REACHABLE SET AND PLACE IN DATA STRUCTURE. C ----------------------------------------------- MARKER(MDNODE) = TAG ISTRT = XADJ(MDNODE) ISTOP = XADJ(MDNODE+1) - 1 C ------------------------------------------------------- C ELMNT POINTS TO THE BEGINNING OF THE LIST OF ELIMINATED C NABORS OF MDNODE, AND RLOC GIVES THE STORAGE LOCATION C FOR THE NEXT REACHABLE NODE. C ------------------------------------------------------- ELMNT = 0 RLOC = ISTRT RLMT = ISTOP DO 200 I = ISTRT, ISTOP NABOR = ADJNCY(I) IF ( NABOR .EQ. 0 ) GO TO 300 IF ( MARKER(NABOR) .GE. TAG ) GO TO 200 MARKER(NABOR) = TAG IF ( DFORW(NABOR) .LT. 0 ) GO TO 100 ADJNCY(RLOC) = NABOR RLOC = RLOC + 1 GO TO 200 100 CONTINUE LLIST(NABOR) = ELMNT ELMNT = NABOR 200 CONTINUE 300 CONTINUE C ----------------------------------------------------- C MERGE WITH REACHABLE NODES FROM GENERALIZED ELEMENTS. C ----------------------------------------------------- IF ( ELMNT .LE. 
0 ) GO TO 1000 ADJNCY(RLMT) = - ELMNT LINK = ELMNT 400 CONTINUE JSTRT = XADJ(LINK) JSTOP = XADJ(LINK+1) - 1 DO 800 J = JSTRT, JSTOP NODE = ADJNCY(J) LINK = - NODE C IF ( NODE ) 400, 900, 500 if ( NODE .LT. 0) GO TO 400 if ( NODE .EQ. 0) GO TO 900 C 500 CONTINUE IF ( MARKER(NODE) .GE. TAG .OR. 1 DFORW(NODE) .LT. 0 ) GO TO 800 MARKER(NODE) = TAG C --------------------------------- C USE STORAGE FROM ELIMINATED NODES C IF NECESSARY. C --------------------------------- 600 CONTINUE IF ( RLOC .LT. RLMT ) GO TO 700 LINK = - ADJNCY(RLMT) RLOC = XADJ(LINK) RLMT = XADJ(LINK+1) - 1 GO TO 600 700 CONTINUE ADJNCY(RLOC) = NODE RLOC = RLOC + 1 800 CONTINUE 900 CONTINUE ELMNT = LLIST(ELMNT) GO TO 300 1000 CONTINUE IF ( RLOC .LE. RLMT ) ADJNCY(RLOC) = 0 C -------------------------------------------------------- C FOR EACH NODE IN THE REACHABLE SET, DO THE FOLLOWING ... C -------------------------------------------------------- LINK = MDNODE 1100 CONTINUE ISTRT = XADJ(LINK) ISTOP = XADJ(LINK+1) - 1 DO 1700 I = ISTRT, ISTOP RNODE = ADJNCY(I) LINK = - RNODE C IF ( RNODE ) 1100, 1800, 1200 if ( RNODE .LT. 0) GO TO 1100 if ( RNODE .EQ. 0) GO TO 1800 C 1200 CONTINUE C -------------------------------------------- C IF RNODE IS IN THE DEGREE LIST STRUCTURE ... C -------------------------------------------- PVNODE = DBAKW(RNODE) IF ( PVNODE .EQ. 0 .OR. 1 PVNODE .EQ. (-MAXINT) ) GO TO 1300 C ------------------------------------- C THEN REMOVE RNODE FROM THE STRUCTURE. C ------------------------------------- NXNODE = DFORW(RNODE) IF ( NXNODE .GT. 0 ) DBAKW(NXNODE) = PVNODE IF ( PVNODE .GT. 0 ) DFORW(PVNODE) = NXNODE NPV = - PVNODE IF ( PVNODE .LT. 0 ) DHEAD(NPV) = NXNODE 1300 CONTINUE C ---------------------------------------- C PURGE INACTIVE QUOTIENT NABORS OF RNODE. C ---------------------------------------- JSTRT = XADJ(RNODE) JSTOP = XADJ(RNODE+1) - 1 XQNBR = JSTRT DO 1400 J = JSTRT, JSTOP NABOR = ADJNCY(J) IF ( NABOR .EQ. 0 ) GO TO 1500 IF ( MARKER(NABOR) .GE. TAG ) GO TO 1400 ADJNCY(XQNBR) = NABOR XQNBR = XQNBR + 1 1400 CONTINUE 1500 CONTINUE C ---------------------------------------- C IF NO ACTIVE NABOR AFTER THE PURGING ... C ---------------------------------------- NQNBRS = XQNBR - JSTRT IF ( NQNBRS .GT. 0 ) GO TO 1600 C ----------------------------- C THEN MERGE RNODE WITH MDNODE. C ----------------------------- QSIZE(MDNODE) = QSIZE(MDNODE) + QSIZE(RNODE) QSIZE(RNODE) = 0 MARKER(RNODE) = MAXINT DFORW(RNODE) = - MDNODE DBAKW(RNODE) = - MAXINT GO TO 1700 1600 CONTINUE C -------------------------------------- C ELSE FLAG RNODE FOR DEGREE UPDATE, AND C ADD MDNODE AS A NABOR OF RNODE. C -------------------------------------- DFORW(RNODE) = NQNBRS + 1 DBAKW(RNODE) = 0 ADJNCY(XQNBR) = MDNODE XQNBR = XQNBR + 1 IF ( XQNBR .LE. JSTOP ) ADJNCY(XQNBR) = 0 C 1700 CONTINUE 1800 CONTINUE RETURN C END C*********************************************************************** C*********************************************************************** C C Version: 0.4 C Last modified: December 27, 1994 C Authors: Joseph W.H. Liu C C Mathematical Sciences Section, Oak Ridge National Laboratory C C*********************************************************************** C*********************************************************************** C--- SPARSPAK-A (ANSI FORTRAN) RELEASE III --- NAME = MMDINT C (C) UNIVERSITY OF WATERLOO JANUARY 1984 C*********************************************************************** C*********************************************************************** C*** MMDINT ..... 
MULT MINIMUM DEGREE INITIALIZATION *********** C*********************************************************************** C*********************************************************************** C C PURPOSE - THIS ROUTINE PERFORMS INITIALIZATION FOR THE C MULTIPLE ELIMINATION VERSION OF THE MINIMUM DEGREE C ALGORITHM. C C INPUT PARAMETERS - C NEQNS - NUMBER OF EQUATIONS. C (XADJ,ADJNCY) - ADJACENCY STRUCTURE. C C OUTPUT PARAMETERS - C (DHEAD,DFORW,DBAKW) - DEGREE DOUBLY LINKED STRUCTURE. C QSIZE - SIZE OF SUPERNODE (INITIALIZED TO ONE). C LLIST - LINKED LIST. C MARKER - MARKER VECTOR. C C*********************************************************************** C SUBROUTINE MMDINT ( NEQNS, XADJ, DHEAD, DFORW, 1 DBAKW, QSIZE, LLIST, MARKER ) C C*********************************************************************** C INTEGER DBAKW(*) , DFORW(*) , DHEAD(*) , 1 LLIST(*) , MARKER(*), QSIZE(*) INTEGER XADJ(*) INTEGER FNODE , NDEG , NEQNS , NODE C C*********************************************************************** C DO 100 NODE = 1, NEQNS DHEAD(NODE) = 0 QSIZE(NODE) = 1 MARKER(NODE) = 0 LLIST(NODE) = 0 100 CONTINUE C ------------------------------------------ C INITIALIZE THE DEGREE DOUBLY LINKED LISTS. C ------------------------------------------ DO 200 NODE = 1, NEQNS NDEG = XADJ(NODE+1) - XADJ(NODE) + 1 FNODE = DHEAD(NDEG) DFORW(NODE) = FNODE DHEAD(NDEG) = NODE IF ( FNODE .GT. 0 ) DBAKW(FNODE) = NODE DBAKW(NODE) = - NDEG 200 CONTINUE RETURN C END C*********************************************************************** C*********************************************************************** C C Version: 0.4 C Last modified: December 27, 1994 C Authors: Joseph W.H. Liu C C Mathematical Sciences Section, Oak Ridge National Laboratory C C*********************************************************************** C*********************************************************************** C--- SPARSPAK-A (ANSI FORTRAN) RELEASE III --- NAME = MMDNUM C (C) UNIVERSITY OF WATERLOO JANUARY 1984 C*********************************************************************** C*********************************************************************** C***** MMDNUM ..... MULTI MINIMUM DEGREE NUMBERING ************* C*********************************************************************** C*********************************************************************** C C PURPOSE - THIS ROUTINE PERFORMS THE FINAL STEP IN C PRODUCING THE PERMUTATION AND INVERSE PERMUTATION C VECTORS IN THE MULTIPLE ELIMINATION VERSION OF THE C MINIMUM DEGREE ORDERING ALGORITHM. C C INPUT PARAMETERS - C NEQNS - NUMBER OF EQUATIONS. C QSIZE - SIZE OF SUPERNODES AT ELIMINATION. C C UPDATED PARAMETERS - C INVP - INVERSE PERMUTATION VECTOR. ON INPUT, C IF QSIZE(NODE)=0, THEN NODE HAS BEEN MERGED C INTO THE NODE -INVP(NODE); OTHERWISE, C -INVP(NODE) IS ITS INVERSE LABELLING. C C OUTPUT PARAMETERS - C PERM - THE PERMUTATION VECTOR. C C*********************************************************************** C SUBROUTINE MMDNUM ( NEQNS, PERM, INVP, QSIZE ) C C*********************************************************************** C INTEGER INVP(*) , PERM(*) , QSIZE(*) INTEGER FATHER, NEQNS , NEXTF , NODE , NQSIZE, 1 NUM , ROOT C C*********************************************************************** C DO 100 NODE = 1, NEQNS NQSIZE = QSIZE(NODE) IF ( NQSIZE .LE. 0 ) PERM(NODE) = INVP(NODE) IF ( NQSIZE .GT. 0 ) PERM(NODE) = - INVP(NODE) 100 CONTINUE C ------------------------------------------------------ C FOR EACH NODE WHICH HAS BEEN MERGED, DO THE FOLLOWING. 
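C
C EDITOR'S NOTE (ADDED COMMENT; NOT PART OF THE ORIGINAL LIU CODE) -
C     AT THIS POINT PERM(NODE) TEMPORARILY HOLDS THE (POSITIVE)
C     INVERSE LABELLING OF EACH UNMERGED NODE, AND MINUS THE NODE
C     INTO WHICH A MERGED NODE WAS ABSORBED.  THE LOOP BELOW WALKS
C     EACH MERGED NODE UP TO ITS UNMERGED ROOT, ASSIGNS IT THE NEXT
C     LABEL AFTER THE ROOT'S CURRENT COUNT (NUM = PERM(ROOT) + 1),
C     AND COMPRESSES THE PATH (PERM(FATHER) = -ROOT) SO THAT LATER
C     TRAVERSALS OF THE MERGED TREE STAY SHORT.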
C ------------------------------------------------------ DO 500 NODE = 1, NEQNS IF ( PERM(NODE) .GT. 0 ) GO TO 500 C ----------------------------------------- C TRACE THE MERGED TREE UNTIL ONE WHICH HAS C NOT BEEN MERGED, CALL IT ROOT. C ----------------------------------------- FATHER = NODE 200 CONTINUE IF ( PERM(FATHER) .GT. 0 ) GO TO 300 FATHER = - PERM(FATHER) GO TO 200 300 CONTINUE C ----------------------- C NUMBER NODE AFTER ROOT. C ----------------------- ROOT = FATHER NUM = PERM(ROOT) + 1 INVP(NODE) = - NUM PERM(ROOT) = NUM C ------------------------ C SHORTEN THE MERGED TREE. C ------------------------ FATHER = NODE 400 CONTINUE NEXTF = - PERM(FATHER) IF ( NEXTF .LE. 0 ) GO TO 500 PERM(FATHER) = - ROOT FATHER = NEXTF GO TO 400 500 CONTINUE C ---------------------- C READY TO COMPUTE PERM. C ---------------------- DO 600 NODE = 1, NEQNS NUM = - INVP(NODE) INVP(NODE) = NUM PERM(NUM) = NODE 600 CONTINUE RETURN C END C*********************************************************************** C*********************************************************************** C C Version: 0.4 C Last modified: December 27, 1994 C Authors: Joseph W.H. Liu C C Mathematical Sciences Section, Oak Ridge National Laboratory C C*********************************************************************** C*********************************************************************** C--- SPARSPAK-A (ANSI FORTRAN) RELEASE III --- NAME = MMDUPD C (C) UNIVERSITY OF WATERLOO JANUARY 1984 C*********************************************************************** C*********************************************************************** C***** MMDUPD ..... MULTIPLE MINIMUM DEGREE UPDATE ************* C*********************************************************************** C*********************************************************************** C C PURPOSE - THIS ROUTINE UPDATES THE DEGREES OF NODES C AFTER A MULTIPLE ELIMINATION STEP. C C INPUT PARAMETERS - C EHEAD - THE BEGINNING OF THE LIST OF ELIMINATED C NODES (I.E., NEWLY FORMED ELEMENTS). C NEQNS - NUMBER OF EQUATIONS. C (XADJ,ADJNCY) - ADJACENCY STRUCTURE. C DELTA - TOLERANCE VALUE FOR MULTIPLE ELIMINATION. C MAXINT - MAXIMUM MACHINE REPRESENTABLE (SHORT) C INTEGER. C C UPDATED PARAMETERS - C MDEG - NEW MINIMUM DEGREE AFTER DEGREE UPDATE. C (DHEAD,DFORW,DBAKW) - DEGREE DOUBLY LINKED STRUCTURE. C QSIZE - SIZE OF SUPERNODE. C LLIST - WORKING LINKED LIST. C MARKER - MARKER VECTOR FOR DEGREE UPDATE. C TAG - TAG VALUE. C C*********************************************************************** C SUBROUTINE MMDUPD ( EHEAD, NEQNS, XADJ, ADJNCY, DELTA, 1 MDEG, DHEAD, DFORW, DBAKW, QSIZE, 1 LLIST, MARKER, MAXINT, TAG ) C C*********************************************************************** C INTEGER ADJNCY(*), DBAKW(*) , DFORW(*) , DHEAD(*) , 1 LLIST(*) , MARKER(*), QSIZE(*) INTEGER XADJ(*) INTEGER DEG , DEG0 , DELTA , EHEAD , ELMNT , 1 ENODE , FNODE , I , IQ2 , ISTOP , 1 ISTRT , J , JSTOP , JSTRT , LINK , 1 MAXINT, MDEG , MDEG0 , MTAG , NABOR , 1 NEQNS , NODE , Q2HEAD, QXHEAD, TAG C C*********************************************************************** C MDEG0 = MDEG + DELTA ELMNT = EHEAD 100 CONTINUE C ------------------------------------------------------- C FOR EACH OF THE NEWLY FORMED ELEMENT, DO THE FOLLOWING. C (RESET TAG VALUE IF NECESSARY.) C ------------------------------------------------------- IF ( ELMNT .LE. 0 ) RETURN MTAG = TAG + MDEG0 IF ( MTAG .LT. MAXINT ) GO TO 300 TAG = 1 DO 200 I = 1, NEQNS IF ( MARKER(I) .LT. 
MAXINT ) MARKER(I) = 0 200 CONTINUE MTAG = TAG + MDEG0 300 CONTINUE C --------------------------------------------- C CREATE TWO LINKED LISTS FROM NODES ASSOCIATED C WITH ELMNT: ONE WITH TWO NABORS (Q2HEAD) IN C ADJACENCY STRUCTURE, AND THE OTHER WITH MORE C THAN TWO NABORS (QXHEAD). ALSO COMPUTE DEG0, C NUMBER OF NODES IN THIS ELEMENT. C --------------------------------------------- Q2HEAD = 0 QXHEAD = 0 DEG0 = 0 LINK = ELMNT 400 CONTINUE ISTRT = XADJ(LINK) ISTOP = XADJ(LINK+1) - 1 DO 700 I = ISTRT, ISTOP ENODE = ADJNCY(I) LINK = - ENODE C IF ( ENODE ) 400, 800, 500 if ( ENODE .LT. 0) GO TO 400 if ( ENODE .EQ. 0) GO TO 800 C C 500 CONTINUE IF ( QSIZE(ENODE) .EQ. 0 ) GO TO 700 DEG0 = DEG0 + QSIZE(ENODE) MARKER(ENODE) = MTAG C ---------------------------------- C IF ENODE REQUIRES A DEGREE UPDATE, C THEN DO THE FOLLOWING. C ---------------------------------- IF ( DBAKW(ENODE) .NE. 0 ) GO TO 700 C --------------------------------------- C PLACE EITHER IN QXHEAD OR Q2HEAD LISTS. C --------------------------------------- IF ( DFORW(ENODE) .EQ. 2 ) GO TO 600 LLIST(ENODE) = QXHEAD QXHEAD = ENODE GO TO 700 600 CONTINUE LLIST(ENODE) = Q2HEAD Q2HEAD = ENODE 700 CONTINUE 800 CONTINUE C -------------------------------------------- C FOR EACH ENODE IN Q2 LIST, DO THE FOLLOWING. C -------------------------------------------- ENODE = Q2HEAD IQ2 = 1 900 CONTINUE IF ( ENODE .LE. 0 ) GO TO 1500 IF ( DBAKW(ENODE) .NE. 0 ) GO TO 2200 TAG = TAG + 1 DEG = DEG0 C ------------------------------------------ C IDENTIFY THE OTHER ADJACENT ELEMENT NABOR. C ------------------------------------------ ISTRT = XADJ(ENODE) NABOR = ADJNCY(ISTRT) IF ( NABOR .EQ. ELMNT ) NABOR = ADJNCY(ISTRT+1) C ------------------------------------------------ C IF NABOR IS UNELIMINATED, INCREASE DEGREE COUNT. C ------------------------------------------------ LINK = NABOR IF ( DFORW(NABOR) .LT. 0 ) GO TO 1000 DEG = DEG + QSIZE(NABOR) GO TO 2100 1000 CONTINUE C -------------------------------------------- C OTHERWISE, FOR EACH NODE IN THE 2ND ELEMENT, C DO THE FOLLOWING. C -------------------------------------------- ISTRT = XADJ(LINK) ISTOP = XADJ(LINK+1) - 1 DO 1400 I = ISTRT, ISTOP NODE = ADJNCY(I) LINK = - NODE IF ( NODE .EQ. ENODE ) GO TO 1400 C IF ( NODE ) 1000, 2100, 1100 if ( NODE .LT. 0 ) GO TO 1000 if ( NODE .EQ. 0 ) GO TO 2100 C C 1100 CONTINUE IF ( QSIZE(NODE) .EQ. 0 ) GO TO 1400 IF ( MARKER(NODE) .GE. TAG ) GO TO 1200 C ------------------------------------- C CASE WHEN NODE IS NOT YET CONSIDERED. C ------------------------------------- MARKER(NODE) = TAG DEG = DEG + QSIZE(NODE) GO TO 1400 1200 CONTINUE C ---------------------------------------- C CASE WHEN NODE IS INDISTINGUISHABLE FROM C ENODE. MERGE THEM INTO A NEW SUPERNODE. C ---------------------------------------- IF ( DBAKW(NODE) .NE. 0 ) GO TO 1400 IF ( DFORW(NODE) .NE. 2 ) GO TO 1300 QSIZE(ENODE) = QSIZE(ENODE) + 1 QSIZE(NODE) QSIZE(NODE) = 0 MARKER(NODE) = MAXINT DFORW(NODE) = - ENODE DBAKW(NODE) = - MAXINT GO TO 1400 1300 CONTINUE C -------------------------------------- C CASE WHEN NODE IS OUTMATCHED BY ENODE. C -------------------------------------- IF ( DBAKW(NODE) .EQ.0 ) 1 DBAKW(NODE) = - MAXINT 1400 CONTINUE GO TO 2100 1500 CONTINUE C ------------------------------------------------ C FOR EACH ENODE IN THE QX LIST, DO THE FOLLOWING. C ------------------------------------------------ ENODE = QXHEAD IQ2 = 0 1600 CONTINUE IF ( ENODE .LE. 0 ) GO TO 2300 IF ( DBAKW(ENODE) .NE. 
0 ) GO TO 2200 TAG = TAG + 1 DEG = DEG0 C --------------------------------- C FOR EACH UNMARKED NABOR OF ENODE, C DO THE FOLLOWING. C --------------------------------- ISTRT = XADJ(ENODE) ISTOP = XADJ(ENODE+1) - 1 DO 2000 I = ISTRT, ISTOP NABOR = ADJNCY(I) IF ( NABOR .EQ. 0 ) GO TO 2100 IF ( MARKER(NABOR) .GE. TAG ) GO TO 2000 MARKER(NABOR) = TAG LINK = NABOR C ------------------------------ C IF UNELIMINATED, INCLUDE IT IN C DEG COUNT. C ------------------------------ IF ( DFORW(NABOR) .LT. 0 ) GO TO 1700 DEG = DEG + QSIZE(NABOR) GO TO 2000 1700 CONTINUE C ------------------------------- C IF ELIMINATED, INCLUDE UNMARKED C NODES IN THIS ELEMENT INTO THE C DEGREE COUNT. C ------------------------------- JSTRT = XADJ(LINK) JSTOP = XADJ(LINK+1) - 1 DO 1900 J = JSTRT, JSTOP NODE = ADJNCY(J) LINK = - NODE C IF ( NODE ) 1700, 2000, 1800 if ( NODE .LT. 0) GO TO 1700 if ( NODE .EQ. 0) GO TO 2000 C C 1800 CONTINUE IF ( MARKER(NODE) .GE. TAG ) 1 GO TO 1900 MARKER(NODE) = TAG DEG = DEG + QSIZE(NODE) 1900 CONTINUE 2000 CONTINUE 2100 CONTINUE C ------------------------------------------- C UPDATE EXTERNAL DEGREE OF ENODE IN DEGREE C STRUCTURE, AND MDEG (MIN DEG) IF NECESSARY. C ------------------------------------------- DEG = DEG - QSIZE(ENODE) + 1 FNODE = DHEAD(DEG) DFORW(ENODE) = FNODE DBAKW(ENODE) = - DEG IF ( FNODE .GT. 0 ) DBAKW(FNODE) = ENODE DHEAD(DEG) = ENODE IF ( DEG .LT. MDEG ) MDEG = DEG 2200 CONTINUE C ---------------------------------- C GET NEXT ENODE IN CURRENT ELEMENT. C ---------------------------------- ENODE = LLIST(ENODE) IF ( IQ2 .EQ. 1 ) GO TO 900 GO TO 1600 2300 CONTINUE C ----------------------------- C GET NEXT ELEMENT IN THE LIST. C ----------------------------- TAG = MTAG ELMNT = LLIST(ELMNT) GO TO 100 C END C*********************************************************************** C*********************************************************************** C C Version: 0.4 C Last modified: December 27, 1994 C Authors: Esmond G. Ng and Barry W. Peyton C RF: modified mmpy8 dependence C C Mathematical Sciences Section, Oak Ridge National Laboratory C C*********************************************************************** C*********************************************************************** C************** MMPY .... MATRIX-MATRIX MULTIPLY ************** C*********************************************************************** C*********************************************************************** C C PURPOSE - C THIS ROUTINE PERFORMS A MATRIX-MATRIX MULTIPLY, Y = Y + XA, C ASSUMING DATA STRUCTURES USED IN SOME OF OUR SPARSE CHOLESKY C CODES. C C INPUT PARAMETERS - C M - NUMBER OF ROWS IN X AND IN Y. C N - NUMBER OF COLUMNS IN X AND NUMBER OF ROWS C IN A. C Q - NUMBER OF COLUMNS IN A AND Y. C SPLIT(*) - BLOCK PARTITIONING OF X. C XPNT(*) - XPNT(J+1) POINTS ONE LOCATION BEYOND THE C END OF THE J-TH COLUMN OF X. XPNT IS ALSO C USED TO ACCESS THE ROWS OF A. C X(*) - CONTAINS THE COLUMNS OF X AND THE ROWS OF A. C LDY - LENGTH OF FIRST COLUMN OF Y. C C EXTERNAL ROUTINES: C MMPYN - MATRIX-MATRIX MULTIPLY, C WITH LEVEL 8 LOOP UNROLLING. C C UPDATED PARAMETERS - C Y(*) - ON OUTPUT, Y = Y + AX. C C*********************************************************************** C SUBROUTINE MMPY ( M , N , Q , SPLIT , XPNT , & X , Y , LDY ) C C*********************************************************************** C C ----------- C PARAMETERS. C ----------- C EXTERNAL MMPY8 INTEGER LDY , M , N , Q INTEGER SPLIT(*) , XPNT(*) DOUBLE PRECISION X(*) , Y(*) C C ---------------- C LOCAL VARIABLES. 
C ---------------- C INTEGER BLK , FSTCOL, NN C C*********************************************************************** C BLK = 1 FSTCOL = 1 100 CONTINUE IF ( FSTCOL .LE. N ) THEN NN = SPLIT(BLK) CALL MMPY8 ( M, NN, Q, XPNT(FSTCOL), X, Y, LDY ) FSTCOL = FSTCOL + NN BLK = BLK + 1 GO TO 100 ENDIF RETURN C END C*********************************************************************** C*********************************************************************** C C Version: 0.4 C Last modified: May 26, 1995 C Authors: Esmond G. Ng, Barry W. Peyton, and Guodong Zhang C C Mathematical Sciences Section, Oak Ridge National Laboratory C C*********************************************************************** C*********************************************************************** C************* MMPY8 .... MATRIX-MATRIX MULTIPLY ************** C*********************************************************************** C*********************************************************************** C C PURPOSE - C THIS ROUTINE PERFORMS A MATRIX-MATRIX MULTIPLY, Y = Y + XA, C ASSUMING DATA STRUCTURES USED IN SOME OF OUR SPARSE CHOLESKY C CODES. C C LOOP UNROLLING: LEVEL 8 UPDATING TWO COLUMNS AT A TIME C C INPUT PARAMETERS - C M - NUMBER OF ROWS IN X AND IN Y. C N - NUMBER OF COLUMNS IN X AND NUMBER OF ROWS C IN A. C Q - NUMBER OF COLUMNS IN A AND Y. C XPNT(*) - XPNT(J+1) POINTS ONE LOCATION BEYOND THE C END OF THE J-TH COLUMN OF X. XPNT IS ALSO C USED TO ACCESS THE ROWS OF A. C X(*) - CONTAINS THE COLUMNS OF X AND THE ROWS OF A. C LDY - LENGTH OF FIRST COLUMN OF Y. C C UPDATED PARAMETERS - C Y(*) - ON OUTPUT, Y = Y + AX. C C*********************************************************************** C SUBROUTINE MMPY8 ( M , N , Q , XPNT , X , & Y , LDY ) C C*********************************************************************** C C ----------- C PARAMETERS. C ----------- C INTEGER LDY , M , N , Q INTEGER XPNT(*) DOUBLE PRECISION X(*) , Y(*) C C ---------------- C LOCAL VARIABLES. C ---------------- C INTEGER I , J , K , QQ INTEGER I1 , I2 , I3 , I4 , I5 , & I6 , I7 , I8 INTEGER IYBEG , IYBEG1, IYBEG2, LENY , MM DOUBLE PRECISION A1 , A2 , A3 , A4 , A5 , & A6 , A7 , A8 , A9 , A10 , & A11 , A12 , A13 , A14 , A15 , & A16 DOUBLE PRECISION B1 , B2 , B3 , B4 , B5 , & B6 , B7 , B8 , Y1 , Y2 C C*********************************************************************** C C ---------------------------------------------------- C COMPUTE EACH DIAGONAL ENTRY OF THE ODD COLUMNS OF Y. C ---------------------------------------------------- C MM = M QQ = MIN(M,Q) IYBEG = 1 LENY = LDY - 1 DO 200 J = 1, QQ-1 , 2 CDIR$ IVDEP DO 100 I = 1, N I1 = XPNT(I+1) - MM A1 = X(I1) Y(IYBEG) = Y(IYBEG) - A1*A1 100 CONTINUE IYBEG = IYBEG + 2*LENY + 1 LENY = LENY - 2 MM = MM - 2 200 CONTINUE C C ------------------------------------------------------- C UPDATE TWO COLUMNS OF Y AT A TIME, EXCEPT THE DIAGONAL C ELEMENT. C NOTE: THE DIAGONAL ELEMENT OF THE ODD COLUMN HAS C BEEN COMPUTED, SO WE COMPUTE THE SAME NUMBER OF C ELEMENTS FOR THE TWO COLUMNS. C ------------------------------------------------------- C MM = M IYBEG = 1 LENY = LDY - 1 C DO 3000 J = 1, QQ-1, 2 C IYBEG1 = IYBEG IYBEG2 = IYBEG + LENY C DO 400 K = 1, N-7, 8 C C ----------------------------------- C EIGHT COLUMNS UPDATING TWO COLUMNS. 
C ----------------------------------- C I1 = XPNT(K+1) - MM I2 = XPNT(K+2) - MM I3 = XPNT(K+3) - MM I4 = XPNT(K+4) - MM I5 = XPNT(K+5) - MM I6 = XPNT(K+6) - MM I7 = XPNT(K+7) - MM I8 = XPNT(K+8) - MM A1 = X(I1) A2 = X(I2) A3 = X(I3) A4 = X(I4) A5 = X(I5) A6 = X(I6) A7 = X(I7) A8 = X(I8) A9 = X(I1+1) A10 = X(I2+1) A11 = X(I3+1) A12 = X(I4+1) A13 = X(I5+1) A14 = X(I6+1) A15 = X(I7+1) A16 = X(I8+1) C Y(IYBEG1+1) = Y(IYBEG1+1) - & A1*A9 - A2*A10 - A3*A11 - A4*A12 - A5*A13 - & A6*A14 - A7*A15 - A8*A16 C Y(IYBEG2+1) = Y(IYBEG2+1) - & A9*A9 - A10*A10 - A11*A11 - A12*A12 - A13*A13 - & A14*A14 - A15*A15 - A16*A16 C DO 300 I = 2, MM-1 Y1 = Y(IYBEG1+I) B1 = X(I1+I) Y1 = Y1 - B1 * A1 Y2 = Y(IYBEG2+I) B2 = X(I2+I) Y2 = Y2 - B1 * A9 Y1 = Y1 - B2 * A2 B3 = X(I3+I) Y2 = Y2 - B2 * A10 Y1 = Y1 - B3 * A3 B4 = X(I4+I) Y2 = Y2 - B3 * A11 Y1 = Y1 - B4 * A4 B5 = X(I5+I) Y2 = Y2 - B4 * A12 Y1 = Y1 - B5 * A5 B6 = X(I6+I) Y2 = Y2 - B5 * A13 Y1 = Y1 - B6 * A6 B7 = X(I7+I) Y2 = Y2 - B6 * A14 Y1 = Y1 - B7 * A7 B8 = X(I8+I) Y2 = Y2 - B7 * A15 Y1 = Y1 - B8 * A8 Y(IYBEG1+I) = Y1 Y2 = Y2 - B8 * A16 Y(IYBEG2+I) = Y2 300 CONTINUE C 400 CONTINUE C C ----------------------------- C BOUNDARY CODE FOR THE K LOOP. C ----------------------------- C C GO TO ( 2000, 1700, 1500, 1300, C & 1100, 900, 700, 500 ), N-K+2 if (N .LT. K) go to 2000 if (N .EQ. K) go to 1700 if (N .EQ. K + 1) go to 1500 if (N .EQ. K + 2) go to 1300 if (N .EQ. K + 3) go to 1100 if (N .EQ. K + 4) go to 900 if (N .EQ. K + 5) go to 700 C C 500 CONTINUE C C ----------------------------------- C SEVEN COLUMNS UPDATING TWO COLUMNS. C ----------------------------------- C I1 = XPNT(K+1) - MM I2 = XPNT(K+2) - MM I3 = XPNT(K+3) - MM I4 = XPNT(K+4) - MM I5 = XPNT(K+5) - MM I6 = XPNT(K+6) - MM I7 = XPNT(K+7) - MM A1 = X(I1) A2 = X(I2) A3 = X(I3) A4 = X(I4) A5 = X(I5) A6 = X(I6) A7 = X(I7) A9 = X(I1+1) A10 = X(I2+1) A11 = X(I3+1) A12 = X(I4+1) A13 = X(I5+1) A14 = X(I6+1) A15 = X(I7+1) C Y(IYBEG1+1) = Y(IYBEG1+1) - & A1*A9 - A2*A10 - A3*A11 - A4*A12 - A5*A13 - & A6*A14 - A7*A15 C Y(IYBEG2+1) = Y(IYBEG2+1) - & A9*A9 - A10*A10 - A11*A11 - A12*A12 - A13*A13 - & A14*A14 - A15*A15 C DO 600 I = 2, MM-1 Y1 = Y(IYBEG1+I) B1 = X(I1+I) Y1 = Y1 - B1 * A1 Y2 = Y(IYBEG2+I) B2 = X(I2+I) Y2 = Y2 - B1 * A9 Y1 = Y1 - B2 * A2 B3 = X(I3+I) Y2 = Y2 - B2 * A10 Y1 = Y1 - B3 * A3 B4 = X(I4+I) Y2 = Y2 - B3 * A11 Y1 = Y1 - B4 * A4 B5 = X(I5+I) Y2 = Y2 - B4 * A12 Y1 = Y1 - B5 * A5 B6 = X(I6+I) Y2 = Y2 - B5 * A13 Y1 = Y1 - B6 * A6 B7 = X(I7+I) Y2 = Y2 - B6 * A14 Y1 = Y1 - B7 * A7 Y(IYBEG1+I) = Y1 Y2 = Y2 - B7 * A15 Y(IYBEG2+I) = Y2 600 CONTINUE C GO TO 2000 C 700 CONTINUE C C --------------------------------- C SIX COLUMNS UPDATING TWO COLUMNS. 
C --------------------------------- C I1 = XPNT(K+1) - MM I2 = XPNT(K+2) - MM I3 = XPNT(K+3) - MM I4 = XPNT(K+4) - MM I5 = XPNT(K+5) - MM I6 = XPNT(K+6) - MM A1 = X(I1) A2 = X(I2) A3 = X(I3) A4 = X(I4) A5 = X(I5) A6 = X(I6) A9 = X(I1+1) A10 = X(I2+1) A11 = X(I3+1) A12 = X(I4+1) A13 = X(I5+1) A14 = X(I6+1) C Y(IYBEG1+1) = Y(IYBEG1+1) - & A1*A9 - A2*A10 - A3*A11 - A4*A12 - A5*A13 - & A6*A14 C Y(IYBEG2+1) = Y(IYBEG2+1) - & A9*A9 - A10*A10 - A11*A11 - A12*A12 - A13*A13 - & A14*A14 C DO 800 I = 2, MM-1 Y1 = Y(IYBEG1+I) B1 = X(I1+I) Y1 = Y1 - B1 * A1 Y2 = Y(IYBEG2+I) B2 = X(I2+I) Y2 = Y2 - B1 * A9 Y1 = Y1 - B2 * A2 B3 = X(I3+I) Y2 = Y2 - B2 * A10 Y1 = Y1 - B3 * A3 B4 = X(I4+I) Y2 = Y2 - B3 * A11 Y1 = Y1 - B4 * A4 B5 = X(I5+I) Y2 = Y2 - B4 * A12 Y1 = Y1 - B5 * A5 B6 = X(I6+I) Y2 = Y2 - B5 * A13 Y1 = Y1 - B6 * A6 Y(IYBEG1+I) = Y1 Y2 = Y2 - B6 * A14 Y(IYBEG2+I) = Y2 800 CONTINUE C GO TO 2000 C 900 CONTINUE C C ---------------------------------- C FIVE COLUMNS UPDATING TWO COLUMNS. C ---------------------------------- C I1 = XPNT(K+1) - MM I2 = XPNT(K+2) - MM I3 = XPNT(K+3) - MM I4 = XPNT(K+4) - MM I5 = XPNT(K+5) - MM A1 = X(I1) A2 = X(I2) A3 = X(I3) A4 = X(I4) A5 = X(I5) A9 = X(I1+1) A10 = X(I2+1) A11 = X(I3+1) A12 = X(I4+1) A13 = X(I5+1) C Y(IYBEG1+1) = Y(IYBEG1+1) - & A1*A9 - A2*A10 - A3*A11 - A4*A12 - A5*A13 C Y(IYBEG2+1) = Y(IYBEG2+1) - & A9*A9 - A10*A10 - A11*A11 - A12*A12 - A13*A13 C DO 1000 I = 2, MM-1 Y1 = Y(IYBEG1+I) B1 = X(I1+I) Y1 = Y1 - B1 * A1 Y2 = Y(IYBEG2+I) B2 = X(I2+I) Y2 = Y2 - B1 * A9 Y1 = Y1 - B2 * A2 B3 = X(I3+I) Y2 = Y2 - B2 * A10 Y1 = Y1 - B3 * A3 B4 = X(I4+I) Y2 = Y2 - B3 * A11 Y1 = Y1 - B4 * A4 B5 = X(I5+I) Y2 = Y2 - B4 * A12 Y1 = Y1 - B5 * A5 Y(IYBEG1+I) = Y1 Y2 = Y2 - B5 * A13 Y(IYBEG2+I) = Y2 1000 CONTINUE C GO TO 2000 C 1100 CONTINUE C C ---------------------------------- C FOUR COLUMNS UPDATING TWO COLUMNS. C ---------------------------------- C I1 = XPNT(K+1) - MM I2 = XPNT(K+2) - MM I3 = XPNT(K+3) - MM I4 = XPNT(K+4) - MM A1 = X(I1) A2 = X(I2) A3 = X(I3) A4 = X(I4) A9 = X(I1+1) A10 = X(I2+1) A11 = X(I3+1) A12 = X(I4+1) C Y(IYBEG1+1) = Y(IYBEG1+1) - & A1*A9 - A2*A10 - A3*A11 - A4*A12 C Y(IYBEG2+1) = Y(IYBEG2+1) - & A9*A9 - A10*A10 - A11*A11 - A12*A12 C DO 1200 I = 2, MM-1 Y1 = Y(IYBEG1+I) B1 = X(I1+I) Y1 = Y1 - B1 * A1 Y2 = Y(IYBEG2+I) B2 = X(I2+I) Y2 = Y2 - B1 * A9 Y1 = Y1 - B2 * A2 B3 = X(I3+I) Y2 = Y2 - B2 * A10 Y1 = Y1 - B3 * A3 B4 = X(I4+I) Y2 = Y2 - B3 * A11 Y1 = Y1 - B4 * A4 Y(IYBEG1+I) = Y1 Y2 = Y2 - B4 * A12 Y(IYBEG2+I) = Y2 1200 CONTINUE C GO TO 2000 C 1300 CONTINUE C C ----------------------------------- C THREE COLUMNS UPDATING TWO COLUMNS. C ----------------------------------- C I1 = XPNT(K+1) - MM I2 = XPNT(K+2) - MM I3 = XPNT(K+3) - MM A1 = X(I1) A2 = X(I2) A3 = X(I3) A9 = X(I1+1) A10 = X(I2+1) A11 = X(I3+1) C Y(IYBEG1+1) = Y(IYBEG1+1) - & A1*A9 - A2*A10 - A3*A11 C Y(IYBEG2+1) = Y(IYBEG2+1) - & A9*A9 - A10*A10 - A11*A11 C DO 1400 I = 2, MM-1 Y1 = Y(IYBEG1+I) B1 = X(I1+I) Y1 = Y1 - B1 * A1 Y2 = Y(IYBEG2+I) B2 = X(I2+I) Y2 = Y2 - B1 * A9 Y1 = Y1 - B2 * A2 B3 = X(I3+I) Y2 = Y2 - B2 * A10 Y1 = Y1 - B3 * A3 Y(IYBEG1+I) = Y1 Y2 = Y2 - B3 * A11 Y(IYBEG2+I) = Y2 1400 CONTINUE C GO TO 2000 C 1500 CONTINUE C C --------------------------------- C TWO COLUMNS UPDATING TWO COLUMNS. 
C --------------------------------- C I1 = XPNT(K+1) - MM I2 = XPNT(K+2) - MM A1 = X(I1) A2 = X(I2) A9 = X(I1+1) A10 = X(I2+1) C Y(IYBEG1+1) = Y(IYBEG1+1) - & A1*A9 - A2*A10 C Y(IYBEG2+1) = Y(IYBEG2+1) - & A9*A9 - A10*A10 C DO 1600 I = 2, MM-1 Y1 = Y(IYBEG1+I) B1 = X(I1+I) Y1 = Y1 - B1 * A1 Y2 = Y(IYBEG2+I) B2 = X(I2+I) Y2 = Y2 - B1 * A9 Y1 = Y1 - B2 * A2 Y(IYBEG1+I) = Y1 Y2 = Y2 - B2 * A10 Y(IYBEG2+I) = Y2 1600 CONTINUE C GO TO 2000 C 1700 CONTINUE C C -------------------------------- C ONE COLUMN UPDATING TWO COLUMNS. C -------------------------------- C I1 = XPNT(K+1) - MM A1 = X(I1) A9 = X(I1+1) C Y(IYBEG1+1) = Y(IYBEG1+1) - & A1*A9 C Y(IYBEG2+1) = Y(IYBEG2+1) - & A9*A9 C DO 1800 I = 2, MM-1 Y1 = Y(IYBEG1+I) B1 = X(I1+I) Y1 = Y1 - B1 * A1 Y2 = Y(IYBEG2+I) Y(IYBEG1+I) = Y1 Y2 = Y2 - B1 * A9 Y(IYBEG2+I) = Y2 1800 CONTINUE C GO TO 2000 C C ----------------------------------------------- C PREPARE FOR NEXT PAIR OF COLUMNS TO BE UPDATED. C ----------------------------------------------- C 2000 CONTINUE MM = MM - 2 IYBEG = IYBEG2 + LENY + 1 LENY = LENY - 2 C 3000 CONTINUE C C ----------------------------------------------------- C BOUNDARY CODE FOR J LOOP: EXECUTED WHENVER Q IS ODD. C ----------------------------------------------------- C IF ( J .EQ. QQ ) THEN CALL SMXPY8 ( MM, N, Y(IYBEG), XPNT, X ) ENDIF C RETURN END C*********************************************************************** C*********************************************************************** C C Version: 0.4 C Last modified: December 27, 1994 C Authors: Esmond G. Ng and Barry W. Peyton C C Mathematical Sciences Section, Oak Ridge National Laboratory C C*********************************************************************** C*********************************************************************** C************* MMPYI .... MATRIX-MATRIX MULTIPLY ************** C*********************************************************************** C*********************************************************************** C C PURPOSE - C THIS ROUTINE PERFORMS A MATRIX-MATRIX MULTIPLY, Y = Y + XA, C ASSUMING DATA STRUCTURES USED IN SOME OF OUR SPARSE CHOLESKY C CODES. C C MATRIX X HAS ONLY 1 COLUMN. C C INPUT PARAMETERS - C M - NUMBER OF ROWS IN X AND IN Y. C Q - NUMBER OF COLUMNS IN A AND Y. C XPNT(*) - XPNT(J+1) POINTS ONE LOCATION BEYOND THE C END OF THE J-TH COLUMN OF X. XPNT IS ALSO C USED TO ACCESS THE ROWS OF A. C X(*) - CONTAINS THE COLUMNS OF X AND THE ROWS OF A. C IY(*) - IY(COL) POINTS TO THE BEGINNING OF COLUMN C RELIND(*) - RELATIVE INDICES. C C UPDATED PARAMETERS - C Y(*) - ON OUTPUT, Y = Y + AX. C C*********************************************************************** C SUBROUTINE MMPYI ( M , Q , XPNT , X , IY , & Y , RELIND ) C C*********************************************************************** C C ----------- C PARAMETERS. C ----------- C INTEGER M , Q INTEGER IY(*) , RELIND(*) , & XPNT(*) DOUBLE PRECISION X(*) , Y(*) C C ---------------- C LOCAL VARIABLES. C ---------------- C INTEGER COL , I , ISUB , K , YLAST DOUBLE PRECISION A C C*********************************************************************** C DO 200 K = 1, Q COL = XPNT(K) YLAST = IY(COL+1) - 1 A = - X(K) CDIR$ IVDEP DO 100 I = K, M ISUB = XPNT(I) ISUB = YLAST - RELIND(ISUB) Y(ISUB) = Y(ISUB) + A*X(I) 100 CONTINUE 200 CONTINUE RETURN C END C*********************************************************************** C*********************************************************************** C C Version: 0.4 C Last modified: December 27, 1994 C Authors: Esmond G. 
Ng and Barry W. Peyton C C Mathematical Sciences Section, Oak Ridge National Laboratory C C*********************************************************************** C*********************************************************************** C**** ORDMMD ..... MULTIPLE MINIMUM EXTERNAL DEGREE ************ C*********************************************************************** C*********************************************************************** C C PURPOSE - THIS ROUTINE CALLS LIU'S MULTIPLE MINIMUM DEGREE C ROUTINE. C C INPUT PARAMETERS - C NEQNS - NUMBER OF EQUATIONS. C (XADJ,ADJNCY) - THE ADJACENCY STRUCTURE. C IWSIZ - SIZE OF INTEGER WORKING STORAGE. C C OUTPUT PARAMETERS - C PERM - THE MINIMUM DEGREE ORDERING. C INVP - THE INVERSE OF PERM. C NOFSUB - AN UPPER BOUND ON THE NUMBER OF NONZERO C SUBSCRIPTS FOR THE COMPRESSED STORAGE SCHEME. C IFLAG - ERROR FLAG. C 0: SUCCESSFUL ORDERING C -1: INSUFFICIENT WORKING STORAGE C [IWORK(*)]. C C WORKING PARAMETERS - C IWORK - INTEGER WORKSPACE OF LENGTH 4*NEQNS. C C*********************************************************************** C SUBROUTINE ORDMMD ( NEQNS , XADJ , ADJNCY, INVP , PERM , 1 IWSIZ , IWORK , NOFSUB, IFLAG ) C C*********************************************************************** C INTEGER ADJNCY(*), INVP(*) , IWORK(*) , PERM(*) INTEGER XADJ(*) INTEGER DELTA , IFLAG , IWSIZ , MAXINT, NEQNS , & NOFSUB C C********************************************************************* C IFLAG = 0 IF ( IWSIZ .LT. 4*NEQNS ) THEN IFLAG = -1 RETURN ENDIF C C DELTA - TOLERANCE VALUE FOR MULTIPLE ELIMINATION. C MAXINT - MAXIMUM MACHINE REPRESENTABLE (SHORT) INTEGER C (ANY SMALLER ESTIMATE WILL DO) FOR MARKING C NODES. C DELTA = 0 MAXINT = 32767 CALL GENMMD ( NEQNS , XADJ , ADJNCY, INVP , PERM , 1 DELTA , 1 IWORK(1) , 1 IWORK(NEQNS+1) , 1 IWORK(2*NEQNS+1) , 1 IWORK(3*NEQNS+1) , 1 MAXINT, NOFSUB ) RETURN C END C*********************************************************************** C*********************************************************************** C C Version: 0.3 C Last modified: December 27, 1994 C Authors: Esmond G. Ng and Barry W. Peyton C C Mathematical Sciences Section, Oak Ridge National Laboratoy C C*********************************************************************** C*********************************************************************** C****** PCHOL .... DENSE PARTIAL CHOLESKY ************** C*********************************************************************** C*********************************************************************** C C PURPOSE - THIS ROUTINE PERFORMS CHOLESKY C FACTORIZATION ON THE COLUMNS OF A SUPERNODE C THAT HAVE RECEIVED ALL UPDATES FROM COLUMNS C EXTERNAL TO THE SUPERNODE. C C INPUT PARAMETERS - C M - NUMBER OF ROWS (LENGTH OF THE FIRST COLUMN). C N - NUMBER OF COLUMNS IN THE SUPERNODE. C XPNT - XPNT(J+1) POINTS ONE LOCATION BEYOND THE END C OF THE J-TH COLUMN OF THE SUPERNODE. C X(*) - CONTAINS THE COLUMNS OF OF THE SUPERNODE TO C BE FACTORED. C C EXTERNAL ROUTINE: C SMXPY8 - MATRIX-VECTOR MULTIPLY WITH 8 LOOP UNROLLING. C C OUTPUT PARAMETERS - C X(*) - ON OUTPUT, CONTAINS THE FACTORED COLUMNS OF C THE SUPERNODE. C C*********************************************************************** C SUBROUTINE PCHOL ( M, N, XPNT, X, MXDIAG, NTINY ) C C*********************************************************************** C C ----------- C PARAMETERS. 
C ----------- C EXTERNAL SMXPY8 C INTEGER M, N C INTEGER XPNT(*) C CxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPC DOUBLE PRECISION X(*), MXDIAG INTEGER NTINY C C ---------------- C LOCAL VARIABLES. C ---------------- C INTEGER JPNT , JCOL , MM C DOUBLE PRECISION DIAG C C*********************************************************************** C C ------------------------------------------ C FOR EVERY COLUMN JCOL IN THE SUPERNODE ... C ------------------------------------------ MM = M JPNT = XPNT(1) DO 100 JCOL = 1, N C C ---------------------------------- C UPDATE JCOL WITH PREVIOUS COLUMNS. C ---------------------------------- IF ( JCOL .GT. 1 ) THEN CALL SMXPY8 ( MM, JCOL-1, X(JPNT), XPNT, X ) ENDIF C C --------------------------- C COMPUTE THE DIAGONAL ENTRY. C --------------------------- DIAG = X(JPNT) CxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPC IF (DIAG .LE. 1.0D-30*MXDIAG) THEN DIAG = 1.0D+128 NTINY = NTINY+1 ENDIF CxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPCxPC DIAG = SQRT ( DIAG ) X(JPNT) = DIAG DIAG = 1.0D+00 / DIAG C C ---------------------------------------------------- C SCALE COLUMN JCOL WITH RECIPROCAL OF DIAGONAL ENTRY. C ---------------------------------------------------- MM = MM - 1 JPNT = JPNT + 1 CALL DSCAL1 ( MM, DIAG, X(JPNT) ) JPNT = JPNT + MM C 100 CONTINUE C RETURN END C*********************************************************************** C*********************************************************************** C C Version: 0.4 C Last modified: January 12, 1995 C Authors: Esmond G. Ng and Barry W. Peyton C C Mathematical Sciences Section, Oak Ridge National Laboratory C C*********************************************************************** C*********************************************************************** C************** SFINIT ..... SET UP FOR SYMB. FACT. ************ C*********************************************************************** C*********************************************************************** C C PURPOSE: C THIS SUBROUTINE COMPUTES THE STORAGE REQUIREMENTS AND SETS UP C PRELIMINARY DATA STRUCTURES FOR THE SYMBOLIC FACTORIZATION. C C NOTE: C THIS VERSION PRODUCES THE MAXIMAL SUPERNODE PARTITION (I.E., C THE ONE WITH THE FEWEST POSSIBLE SUPERNODES). C C INPUT PARAMETERS: C NEQNS - NUMBER OF EQUATIONS. C NNZA - LENGTH OF ADJACENCY STRUCTURE. C XADJ(*) - ARRAY OF LENGTH NEQNS+1, CONTAINING POINTERS C TO THE ADJACENCY STRUCTURE. C ADJNCY(*) - ARRAY OF LENGTH XADJ(NEQNS+1)-1, CONTAINING C THE ADJACENCY STRUCTURE. C PERM(*) - ARRAY OF LENGTH NEQNS, CONTAINING THE C POSTORDERING. C INVP(*) - ARRAY OF LENGTH NEQNS, CONTAINING THE C INVERSE OF THE POSTORDERING. C IWSIZ - SIZE OF INTEGER WORKING STORAGE. C C OUTPUT PARAMETERS: C COLCNT(*) - ARRAY OF LENGTH NEQNS, CONTAINING THE NUMBER C OF NONZEROS IN EACH COLUMN OF THE FACTOR, C INCLUDING THE DIAGONAL ENTRY. C NNZL - NUMBER OF NONZEROS IN THE FACTOR, INCLUDING C THE DIAGONAL ENTRIES. C NSUB - NUMBER OF SUBSCRIPTS. C NSUPER - NUMBER OF SUPERNODES (<= NEQNS). C SNODE(*) - ARRAY OF LENGTH NEQNS FOR RECORDING C SUPERNODE MEMBERSHIP. C XSUPER(*) - ARRAY OF LENGTH NEQNS+1, CONTAINING THE C SUPERNODE PARTITIONING. C IFLAG(*) - ERROR FLAG. C 0: SUCCESSFUL SF INITIALIZATION. C -1: INSUFFICENT WORKING STORAGE C [IWORK(*)]. C C WORK PARAMETERS: C IWORK(*) - INTEGER WORK ARRAY OF LENGTH 7*NEQNS+3. C C FIRST CREATED ON NOVEMEBER 14, 1994. C LAST UPDATED ON January 12, 1995. 
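C
C EDITOR'S NOTE (ADDED COMMENT; NOT PART OF THE ORIGINAL NG/PEYTON
C     CODE) - A SKETCH OF THE CALL SEQUENCE DRIVEN BELOW:
C     ETORDR BUILDS AND POSTORDERS THE ELIMINATION TREE, FCNTHN
C     COMPUTES THE ROW AND COLUMN FACTOR NONZERO COUNTS, CHORDR
C     REARRANGES CHILDREN SO THE LAST CHILD HAS THE LARGEST COLUMN
C     COUNT, AND FSUP1/FSUP2 DELIVER THE SUPERNODE COUNT NSUPER,
C     THE MEMBERSHIP VECTOR SNODE(*) AND THE PARTITION XSUPER(*).
C     ALL FOUR STEPS SHARE THE SINGLE INTEGER WORKSPACE IWORK(*)
C     OF LENGTH 7*NEQNS+3.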
C C*********************************************************************** C SUBROUTINE SFINIT ( NEQNS , NNZA , XADJ , ADJNCY, PERM , & INVP , COLCNT, NNZL , NSUB , NSUPER, & SNODE , XSUPER, IWSIZ , IWORK , IFLAG ) C C ----------- C PARAMETERS. C ----------- INTEGER IFLAG , IWSIZ , NNZA , NEQNS , NNZL , & NSUB , NSUPER INTEGER ADJNCY(NNZA) , COLCNT(NEQNS) , & INVP(NEQNS) , IWORK(7*NEQNS+3), & PERM(NEQNS) , SNODE(NEQNS) , & XADJ(NEQNS+1) , XSUPER(NEQNS+1) C C*********************************************************************** C C -------------------------------------------------------- C RETURN IF THERE IS INSUFFICIENT INTEGER WORKING STORAGE. C -------------------------------------------------------- IFLAG = 0 IF ( IWSIZ .LT. 7*NEQNS+3 ) THEN IFLAG = -1 RETURN ENDIF C C ------------------------------------------ C COMPUTE ELIMINATION TREE AND POSTORDERING. C ------------------------------------------ CALL ETORDR ( NEQNS , XADJ , ADJNCY, PERM , INVP , & IWORK(1) , & IWORK(NEQNS+1) , & IWORK(2*NEQNS+1) , & IWORK(3*NEQNS+1) ) C C --------------------------------------------- C COMPUTE ROW AND COLUMN FACTOR NONZERO COUNTS. C --------------------------------------------- CALL FCNTHN ( NEQNS , NNZA , XADJ , ADJNCY, PERM , & INVP , IWORK(1) , SNODE , COLCNT, & NNZL , & IWORK(NEQNS+1) , & IWORK(2*NEQNS+1) , & XSUPER , & IWORK(3*NEQNS+1) , & IWORK(4*NEQNS+2) , & IWORK(5*NEQNS+3) , & IWORK(6*NEQNS+4) ) C C --------------------------------------------------------- C REARRANGE CHILDREN SO THAT THE LAST CHILD HAS THE MAXIMUM C NUMBER OF NONZEROS IN ITS COLUMN OF L. C --------------------------------------------------------- CALL CHORDR ( NEQNS , PERM , INVP , & COLCNT, & IWORK(1) , & IWORK(NEQNS+1) , & IWORK(2*NEQNS+1) , & IWORK(3*NEQNS+1) ) C C ---------------- C FIND SUPERNODES. C ---------------- CALL FSUP1 ( NEQNS , IWORK(1) , COLCNT, NSUB , & NSUPER, SNODE ) CALL FSUP2 ( NEQNS , NSUPER, SNODE, XSUPER ) C RETURN END C*********************************************************************** C*********************************************************************** C C Version: 0.4 C Last modified: December 27, 1994 C Authors: Esmond G. Ng and Barry W. Peyton C C Mathematical Sciences Section, Oak Ridge National Laboratory C C*********************************************************************** C*********************************************************************** C****** SMXPY8 .... MATRIX-VECTOR MULTIPLY ************** C*********************************************************************** C*********************************************************************** C C PURPOSE - THIS ROUTINE PERFORMS A MATRIX-VECTOR MULTIPLY, C Y = Y + AX, ASSUMING DATA STRUCTURES USED IN C RECENTLY DEVELOPED SPARSE CHOLESKY CODES. THE C '8' SIGNIFIES LEVEL 8 LOOP UNROLLING. C C INPUT PARAMETERS - C M - NUMBER OF ROWS. C N - NUMBER OF COLUMNS. C Y - M-VECTOR TO WHICH AX WILL BE ADDED. C APNT - INDEX VECTOR FOR A. APNT(I) POINTS TO THE C FIRST NONZERO IN COLUMN I OF A. C Y - ON OUTPUT, CONTAINS Y = Y + AX. C C*********************************************************************** C SUBROUTINE SMXPY8 ( M, N, Y, APNT, A ) C C*********************************************************************** C C ----------- C PARAMETERS. C ----------- C INTEGER M, N, LEVEL C INTEGER APNT(*) C DOUBLE PRECISION Y(*), A(*) C PARAMETER ( LEVEL = 8 ) C C ---------------- C LOCAL VARIABLES. 
C ---------------- C INTEGER I, I1, I2, I3, I4, I5, I6, I7, I8, & J, REMAIN C DOUBLE PRECISION A1, A2, A3, A4, A5, A6, A7, A8 C C*********************************************************************** C REMAIN = MOD ( N, LEVEL ) C C GO TO ( 2000, 100, 200, 300, C & 400, 500, 600, 700 ), REMAIN+1 if (REMAIN .eq. 0) go to 2000 C if (REMAIN .eq. 1) go to 100 if (REMAIN .eq. 2) go to 200 if (REMAIN .eq. 3) go to 300 if (REMAIN .eq. 4) go to 400 if (REMAIN .eq. 5) go to 500 if (REMAIN .eq. 6) go to 600 if (REMAIN .eq. 7) go to 700 C C 100 CONTINUE I1 = APNT(1+1) - M A1 = - A(I1) DO 150 I = 1, M Y(I) = Y(I) + A1*A(I1) I1 = I1 + 1 150 CONTINUE GO TO 2000 C 200 CONTINUE I1 = APNT(1+1) - M I2 = APNT(1+2) - M A1 = - A(I1) A2 = - A(I2) DO 250 I = 1, M Y(I) = ( (Y(I)) & + A1*A(I1)) + A2*A(I2) I1 = I1 + 1 I2 = I2 + 1 250 CONTINUE GO TO 2000 C 300 CONTINUE I1 = APNT(1+1) - M I2 = APNT(1+2) - M I3 = APNT(1+3) - M A1 = - A(I1) A2 = - A(I2) A3 = - A(I3) DO 350 I = 1, M Y(I) = (( (Y(I)) & + A1*A(I1)) + A2*A(I2)) & + A3*A(I3) I1 = I1 + 1 I2 = I2 + 1 I3 = I3 + 1 350 CONTINUE GO TO 2000 C 400 CONTINUE I1 = APNT(1+1) - M I2 = APNT(1+2) - M I3 = APNT(1+3) - M I4 = APNT(1+4) - M A1 = - A(I1) A2 = - A(I2) A3 = - A(I3) A4 = - A(I4) DO 450 I = 1, M Y(I) = ((( (Y(I)) & + A1*A(I1)) + A2*A(I2)) & + A3*A(I3)) + A4*A(I4) I1 = I1 + 1 I2 = I2 + 1 I3 = I3 + 1 I4 = I4 + 1 450 CONTINUE GO TO 2000 C 500 CONTINUE I1 = APNT(1+1) - M I2 = APNT(1+2) - M I3 = APNT(1+3) - M I4 = APNT(1+4) - M I5 = APNT(1+5) - M A1 = - A(I1) A2 = - A(I2) A3 = - A(I3) A4 = - A(I4) A5 = - A(I5) DO 550 I = 1, M Y(I) = (((( (Y(I)) & + A1*A(I1)) + A2*A(I2)) & + A3*A(I3)) + A4*A(I4)) & + A5*A(I5) I1 = I1 + 1 I2 = I2 + 1 I3 = I3 + 1 I4 = I4 + 1 I5 = I5 + 1 550 CONTINUE GO TO 2000 C 600 CONTINUE I1 = APNT(1+1) - M I2 = APNT(1+2) - M I3 = APNT(1+3) - M I4 = APNT(1+4) - M I5 = APNT(1+5) - M I6 = APNT(1+6) - M A1 = - A(I1) A2 = - A(I2) A3 = - A(I3) A4 = - A(I4) A5 = - A(I5) A6 = - A(I6) DO 650 I = 1, M Y(I) = ((((( (Y(I)) & + A1*A(I1)) + A2*A(I2)) & + A3*A(I3)) + A4*A(I4)) & + A5*A(I5)) + A6*A(I6) I1 = I1 + 1 I2 = I2 + 1 I3 = I3 + 1 I4 = I4 + 1 I5 = I5 + 1 I6 = I6 + 1 650 CONTINUE GO TO 2000 C 700 CONTINUE I1 = APNT(1+1) - M I2 = APNT(1+2) - M I3 = APNT(1+3) - M I4 = APNT(1+4) - M I5 = APNT(1+5) - M I6 = APNT(1+6) - M I7 = APNT(1+7) - M A1 = - A(I1) A2 = - A(I2) A3 = - A(I3) A4 = - A(I4) A5 = - A(I5) A6 = - A(I6) A7 = - A(I7) DO 750 I = 1, M Y(I) = (((((( (Y(I)) & + A1*A(I1)) + A2*A(I2)) & + A3*A(I3)) + A4*A(I4)) & + A5*A(I5)) + A6*A(I6)) & + A7*A(I7) I1 = I1 + 1 I2 = I2 + 1 I3 = I3 + 1 I4 = I4 + 1 I5 = I5 + 1 I6 = I6 + 1 I7 = I7 + 1 750 CONTINUE GO TO 2000 C 2000 CONTINUE DO 4000 J = REMAIN+1, N, LEVEL I1 = APNT(J+1) - M I2 = APNT(J+2) - M I3 = APNT(J+3) - M I4 = APNT(J+4) - M I5 = APNT(J+5) - M I6 = APNT(J+6) - M I7 = APNT(J+7) - M I8 = APNT(J+8) - M A1 = - A(I1) A2 = - A(I2) A3 = - A(I3) A4 = - A(I4) A5 = - A(I5) A6 = - A(I6) A7 = - A(I7) A8 = - A(I8) DO 3000 I = 1, M Y(I) = ((((((( (Y(I)) & + A1*A(I1)) + A2*A(I2)) & + A3*A(I3)) + A4*A(I4)) & + A5*A(I5)) + A6*A(I6)) & + A7*A(I7)) + A8*A(I8) I1 = I1 + 1 I2 = I2 + 1 I3 = I3 + 1 I4 = I4 + 1 I5 = I5 + 1 I6 = I6 + 1 I7 = I7 + 1 I8 = I8 + 1 3000 CONTINUE 4000 CONTINUE C RETURN END C*********************************************************************** C*********************************************************************** C C Version: 0.4 C Last modified: February 13, 1995 C Authors: Esmond G. Ng and Barry W. 
Peyton C C Mathematical Sciences Section, Oak Ridge National Laboratory C C*********************************************************************** C*********************************************************************** C************* SYMFC2 ..... SYMBOLIC FACTORIZATION ************** C*********************************************************************** C*********************************************************************** C C PURPOSE: C THIS ROUTINE PERFORMS SUPERNODAL SYMBOLIC FACTORIZATION ON A C REORDERED LINEAR SYSTEM. IT ASSUMES ACCESS TO THE COLUMNS C COUNTS, SUPERNODE PARTITION, AND SUPERNODAL ELIMINATION TREE C ASSOCIATED WITH THE FACTOR MATRIX L. C C INPUT PARAMETERS: C (I) NEQNS - NUMBER OF EQUATIONS C (I) ADJLEN - LENGTH OF THE ADJACENCY LIST. C (I) XADJ(*) - ARRAY OF LENGTH NEQNS+1 CONTAINING POINTERS C TO THE ADJACENCY STRUCTURE. C (I) ADJNCY(*) - ARRAY OF LENGTH XADJ(NEQNS+1)-1 CONTAINING C THE ADJACENCY STRUCTURE. C (I) PERM(*) - ARRAY OF LENGTH NEQNS CONTAINING THE C POSTORDERING. C (I) INVP(*) - ARRAY OF LENGTH NEQNS CONTAINING THE C INVERSE OF THE POSTORDERING. C (I) COLCNT(*) - ARRAY OF LENGTH NEQNS, CONTAINING THE NUMBER C OF NONZEROS IN EACH COLUMN OF THE FACTOR, C INCLUDING THE DIAGONAL ENTRY. C (I) NSUPER - NUMBER OF SUPERNODES. C (I) XSUPER(*) - ARRAY OF LENGTH NSUPER+1, CONTAINING THE C FIRST COLUMN OF EACH SUPERNODE. C (I) SNODE(*) - ARRAY OF LENGTH NEQNS FOR RECORDING C SUPERNODE MEMBERSHIP. C (I) NOFSUB - NUMBER OF SUBSCRIPTS TO BE STORED IN C LINDX(*). C C OUTPUT PARAMETERS: C (I) XLINDX - ARRAY OF LENGTH NEQNS+1, CONTAINING POINTERS C INTO THE SUBSCRIPT VECTOR. C (I) LINDX - ARRAY OF LENGTH MAXSUB, CONTAINING THE C COMPRESSED SUBSCRIPTS. C (I) XLNZ - COLUMN POINTERS FOR L. C (I) FLAG - ERROR FLAG: C 0 - NO ERROR. C 1 - INCONSISTANCY IN THE INPUT. C C WORKING PARAMETERS: C (I) MRGLNK - ARRAY OF LENGTH NSUPER, CONTAINING THE C CHILDREN OF EACH SUPERNODE AS A LINKED LIST. C (I) RCHLNK - ARRAY OF LENGTH NEQNS+1, CONTAINING THE C CURRENT LINKED LIST OF MERGED INDICES (THE C "REACH" SET). C (I) MARKER - ARRAY OF LENGTH NEQNS USED TO MARK INDICES C AS THEY ARE INTRODUCED INTO EACH SUPERNODE'S C INDEX SET. C C*********************************************************************** C SUBROUTINE SYMFC2 ( NEQNS , ADJLEN, XADJ , ADJNCY, PERM , & INVP , COLCNT, NSUPER, XSUPER, SNODE , & NOFSUB, XLINDX, LINDX , XLNZ , MRGLNK, & RCHLNK, MARKER, FLAG ) C C*********************************************************************** C C ----------- C PARAMETERS. C ----------- INTEGER ADJLEN, FLAG , NEQNS , NOFSUB, NSUPER INTEGER ADJNCY(ADJLEN), COLCNT(NEQNS) , & INVP(NEQNS) , MARKER(NEQNS) , & MRGLNK(NSUPER), LINDX(NOFSUB) , & PERM(NEQNS) , RCHLNK(0:NEQNS), & SNODE(NEQNS) , XSUPER(NSUPER+1) INTEGER XADJ(NEQNS+1) , XLINDX(NSUPER+1), & XLNZ(NEQNS+1) C C ---------------- C LOCAL VARIABLES. C ---------------- INTEGER FSTCOL, HEAD , I , JNZBEG, JNZEND, & JPTR , JSUP , JWIDTH, KNZ , KNZBEG, & KNZEND, KPTR , KSUP , LENGTH, LSTCOL, & NEWI , NEXTI , NODE , NZBEG , NZEND , & PCOL , PSUP , POINT , TAIL , WIDTH C C*********************************************************************** C FLAG = 0 IF ( NEQNS .LE. 0 ) RETURN C C --------------------------------------------------- C INITIALIZATIONS ... C NZEND : POINTS TO THE LAST USED SLOT IN LINDX. C TAIL : END OF LIST INDICATOR C (IN RCHLNK(*), NOT MRGLNK(*)). C MRGLNK : CREATE EMPTY LISTS. C MARKER : "UNMARK" THE INDICES. 
C --------------------------------------------------- NZEND = 0 HEAD = 0 TAIL = NEQNS + 1 POINT = 1 DO 50 I = 1, NEQNS MARKER(I) = 0 XLNZ(I) = POINT POINT = POINT + COLCNT(I) 50 CONTINUE XLNZ(NEQNS+1) = POINT POINT = 1 DO 100 KSUP = 1, NSUPER MRGLNK(KSUP) = 0 FSTCOL = XSUPER(KSUP) XLINDX(KSUP) = POINT POINT = POINT + COLCNT(FSTCOL) 100 CONTINUE XLINDX(NSUPER+1) = POINT C C --------------------------- C FOR EACH SUPERNODE KSUP ... C --------------------------- DO 1000 KSUP = 1, NSUPER C C --------------------------------------------------------- C INITIALIZATIONS ... C FSTCOL : FIRST COLUMN OF SUPERNODE KSUP. C LSTCOL : LAST COLUMN OF SUPERNODE KSUP. C KNZ : WILL COUNT THE NONZEROS OF L IN COLUMN KCOL. C RCHLNK : INITIALIZE EMPTY INDEX LIST FOR KCOL. C --------------------------------------------------------- FSTCOL = XSUPER(KSUP) LSTCOL = XSUPER(KSUP+1) - 1 WIDTH = LSTCOL - FSTCOL + 1 LENGTH = COLCNT(FSTCOL) KNZ = 0 RCHLNK(HEAD) = TAIL JSUP = MRGLNK(KSUP) C C ------------------------------------------------- C IF KSUP HAS CHILDREN IN THE SUPERNODAL E-TREE ... C ------------------------------------------------- IF ( JSUP .GT. 0 ) THEN C --------------------------------------------- C COPY THE INDICES OF THE FIRST CHILD JSUP INTO C THE LINKED LIST, AND MARK EACH WITH THE VALUE C KSUP. C --------------------------------------------- JWIDTH = XSUPER(JSUP+1) - XSUPER(JSUP) JNZBEG = XLINDX(JSUP) + JWIDTH JNZEND = XLINDX(JSUP+1) - 1 DO 200 JPTR = JNZEND, JNZBEG, -1 NEWI = LINDX(JPTR) KNZ = KNZ+1 MARKER(NEWI) = KSUP RCHLNK(NEWI) = RCHLNK(HEAD) RCHLNK(HEAD) = NEWI 200 CONTINUE C ------------------------------------------ C FOR EACH SUBSEQUENT CHILD JSUP OF KSUP ... C ------------------------------------------ JSUP = MRGLNK(JSUP) 300 CONTINUE IF ( JSUP .NE. 0 .AND. KNZ .LT. LENGTH ) THEN C ---------------------------------------- C MERGE THE INDICES OF JSUP INTO THE LIST, C AND MARK NEW INDICES WITH VALUE KSUP. C ---------------------------------------- JWIDTH = XSUPER(JSUP+1) - XSUPER(JSUP) JNZBEG = XLINDX(JSUP) + JWIDTH JNZEND = XLINDX(JSUP+1) - 1 NEXTI = HEAD DO 500 JPTR = JNZBEG, JNZEND NEWI = LINDX(JPTR) 400 CONTINUE I = NEXTI NEXTI = RCHLNK(I) IF ( NEWI .GT. NEXTI ) GO TO 400 IF ( NEWI .LT. NEXTI ) THEN KNZ = KNZ+1 RCHLNK(I) = NEWI RCHLNK(NEWI) = NEXTI MARKER(NEWI) = KSUP NEXTI = NEWI ENDIF 500 CONTINUE JSUP = MRGLNK(JSUP) GO TO 300 ENDIF ENDIF C --------------------------------------------------- C STRUCTURE OF A(*,FSTCOL) HAS NOT BEEN EXAMINED YET. C "SORT" ITS STRUCTURE INTO THE LINKED LIST, C INSERTING ONLY THOSE INDICES NOT ALREADY IN THE C LIST. C --------------------------------------------------- IF ( KNZ .LT. LENGTH ) THEN NODE = PERM(FSTCOL) KNZBEG = XADJ(NODE) KNZEND = XADJ(NODE+1) - 1 DO 700 KPTR = KNZBEG, KNZEND NEWI = ADJNCY(KPTR) NEWI = INVP(NEWI) IF ( NEWI .GT. FSTCOL .AND. & MARKER(NEWI) .NE. KSUP ) THEN C -------------------------------- C POSITION AND INSERT NEWI IN LIST C AND MARK IT WITH KCOL. C -------------------------------- NEXTI = HEAD 600 CONTINUE I = NEXTI NEXTI = RCHLNK(I) IF ( NEWI .GT. NEXTI ) GO TO 600 KNZ = KNZ + 1 RCHLNK(I) = NEWI RCHLNK(NEWI) = NEXTI MARKER(NEWI) = KSUP ENDIF 700 CONTINUE ENDIF C ------------------------------------------------------------ C IF KSUP HAS NO CHILDREN, INSERT FSTCOL INTO THE LINKED LIST. C ------------------------------------------------------------ IF ( RCHLNK(HEAD) .NE. 
FSTCOL ) THEN RCHLNK(FSTCOL) = RCHLNK(HEAD) RCHLNK(HEAD) = FSTCOL KNZ = KNZ + 1 ENDIF C C -------------------------------------------- C COPY INDICES FROM LINKED LIST INTO LINDX(*). C -------------------------------------------- NZBEG = NZEND + 1 NZEND = NZEND + KNZ IF ( NZEND+1 .NE. XLINDX(KSUP+1) ) GO TO 8000 I = HEAD DO 800 KPTR = NZBEG, NZEND I = RCHLNK(I) LINDX(KPTR) = I 800 CONTINUE C C --------------------------------------------------- C IF KSUP HAS A PARENT, INSERT KSUP INTO ITS PARENT'S C "MERGE" LIST. C --------------------------------------------------- IF ( LENGTH .GT. WIDTH ) THEN PCOL = LINDX ( XLINDX(KSUP) + WIDTH ) PSUP = SNODE(PCOL) MRGLNK(KSUP) = MRGLNK(PSUP) MRGLNK(PSUP) = KSUP ENDIF C 1000 CONTINUE C RETURN C C ----------------------------------------------- C INCONSISTENCY IN DATA STRUCTURE WAS DISCOVERED. C ----------------------------------------------- 8000 CONTINUE FLAG = -2 RETURN C END subroutine genrcm ( node_num, adj_num, adj_row, adj, perm ) !*****************************************************************************80 ! !! GENRCM finds the reverse Cuthill-Mckee ordering for a general graph. ! ! Discussion: ! ! For each connected component in the graph, the routine obtains ! an ordering by calling RCM. ! ! Modified: ! ! 04 January 2003 ! ! Author: ! ! Alan George, Joseph Liu ! FORTRAN90 version by John Burkardt ! ! Reference: ! ! Alan George, Joseph Liu, ! Computer Solution of Large Sparse Positive Definite Systems, ! Prentice Hall, 1981. ! ! Parameters: ! ! Input, integer NODE_NUM, the number of nodes. ! ! Input, integer ADJ_NUM, the number of adjacency entries. ! ! Input, integer ADJ_ROW(NODE_NUM+1). Information about row I is stored ! in entries ADJ_ROW(I) through ADJ_ROW(I+1)-1 of ADJ. ! ! Input, integer ADJ(ADJ_NUM), the adjacency structure. ! For each row, it contains the column indices of the nonzero entries. ! ! Output, integer PERM(NODE_NUM), the RCM ordering. ! ! Local Parameters: ! ! Local, integer LEVEL_ROW(NODE_NUM+1), the index vector for a level ! structure. The level structure is stored in the currently unused ! spaces in the permutation vector PERM. ! ! Local, integer MASK(NODE_NUM), marks variables that have been numbered. ! implicit none integer adj_num,node_num integer adj(adj_num) integer adj_row(node_num+1) integer i integer iccsze integer mask(node_num) integer level_num integer level_row(node_num+1) integer num integer perm(node_num) integer root do i=1,node_num mask(i) = 1 enddo num = 1 do i = 1, node_num ! ! For each masked connected component... ! if ( mask(i).ne. 0 ) then root = i ! ! Find a pseudo-peripheral node ROOT. The level structure found by ! ROOT_FIND is stored starting at PERM(NUM). ! call root_find ( root, adj_num, adj_row, adj, mask, & level_num, level_row, perm(num), node_num ) ! ! RCM orders the component using ROOT as the starting node. ! call rcm ( root, adj_num, adj_row, adj, mask, perm(num), & iccsze, node_num ) num = num + iccsze ! ! We can stop once every node is in one of the connected components. ! if ( node_num .lt. num ) then return endif endif enddo return end subroutine rcm ( root, adj_num, adj_row, adj, mask, perm, iccsze, & node_num ) !*****************************************************************************80 ! !! RCM renumbers a connected component by the reverse Cuthill McKee algorithm. ! ! Discussion: ! ! The connected component is specified by a node ROOT and a mask. ! The numbering starts at the root node. ! ! An outline of the algorithm is as follows: ! ! X(1) = ROOT. ! ! 
for ( I = 1 to N-1) ! Find all unlabeled neighbors of X(I), ! assign them the next available labels, in order of increasing degree. ! ! When done, reverse the ordering. ! ! Modified: ! ! 02 January 2007 ! ! Author: ! ! Alan George, Joseph Liu ! FORTRAN90 version by John Burkardt ! ! Reference: ! ! Alan George, Joseph Liu, ! Computer Solution of Large Sparse Positive Definite Systems, ! Prentice Hall, 1981. ! ! Parameters: ! ! Input, integer ROOT, the node that defines the connected component. ! It is used as the starting point for the RCM ordering. ! ! Input, integer ADJ_NUM, the number of adjacency entries. ! ! Input, integer ADJ_ROW(NODE_NUM+1). Information about row I is stored ! in entries ADJ_ROW(I) through ADJ_ROW(I+1)-1 of ADJ. ! ! Input, integer ADJ(ADJ_NUM), the adjacency structure. ! For each row, it contains the column indices of the nonzero entries. ! ! Input/output, integer MASK(NODE_NUM), a mask for the nodes. Only ! those nodes with nonzero input mask values are considered by the ! routine. The nodes numbered by RCM will have their mask values ! set to zero. ! ! Output, integer PERM(NODE_NUM), the RCM ordering. ! ! Output, integer ICCSZE, the size of the connected component ! that has been numbered. ! ! Input, integer NODE_NUM, the number of nodes. ! ! Local Parameters: ! ! Workspace, integer DEG(NODE_NUM), a temporary vector used to hold ! the degree of the nodes in the section graph specified by mask and root. ! implicit none integer adj_num integer node_num integer adj(adj_num) integer adj_row(node_num+1) integer deg(node_num) integer fnbr integer i integer iccsze integer j integer jstop integer jstrt integer k integer l integer lbegin integer lnbr integer lperm integer lvlend integer mask(node_num) integer nbr integer node integer perm(node_num) integer root ! ! Find the degrees of the nodes in the component specified by MASK and ROOT. ! call degree ( root, adj_num, adj_row, adj, mask, deg, iccsze, & perm, node_num ) mask(root) = 0 if ( iccsze .le. 1 ) then return end if lvlend = 0 lnbr = 1 ! ! LBEGIN and LVLEND point to the beginning and ! the end of the current level respectively. ! do while ( lvlend .lt. lnbr ) lbegin = lvlend + 1 lvlend = lnbr do i = lbegin, lvlend ! ! For each node in the current level... ! node = perm(i) jstrt = adj_row(node) jstop = adj_row(node+1) - 1 ! ! Find the unnumbered neighbors of NODE. ! ! FNBR and LNBR point to the first and last neighbors ! of the current node in PERM. ! fnbr = lnbr + 1 do j = jstrt, jstop nbr = adj(j) if ( mask(nbr) .ne. 0 ) then lnbr = lnbr + 1 mask(nbr) = 0 perm(lnbr) = nbr end if end do ! ! If no neighbors, skip to next node in this level. ! cc if ( lnbr .le. fnbr ) then cc cycle cc end if if ( lnbr .gt. fnbr ) then ! ! Sort the neighbors of NODE in increasing order by degree. ! Linear insertion is used. ! k = fnbr do while ( k .lt. lnbr ) l = k k = k + 1 nbr = perm(k) do while ( fnbr .lt. l ) lperm = perm(l) if ( deg(lperm) .le. deg(nbr) ) then exit end if perm(l+1) = lperm l = l - 1 end do perm(l+1) = nbr end do end if end do end do ! ! We now have the Cuthill-McKee ordering. Reverse it. ! k=iccsze/2 l=iccsze do i=1,k lperm=perm(l) perm(l)=perm(i) perm(i)=lperm l=l-1 enddo return end subroutine root_find ( root, adj_num, adj_row, adj, mask, & level_num, level_row, level, node_num ) !*****************************************************************************80 ! !! ROOT_FIND finds a pseudo-peripheral node. ! ! Discussion: ! ! The diameter of a graph is the maximum distance (number of edges) ! 
between any two nodes of the graph. ! ! The eccentricity of a node is the maximum distance between that ! node and any other node of the graph. ! ! A peripheral node is a node whose eccentricity equals the ! diameter of the graph. ! ! A pseudo-peripheral node is an approximation to a peripheral node; ! it may be a peripheral node, but all we know is that we tried our ! best. ! ! The routine is given a graph, and seeks pseudo-peripheral nodes, ! using a modified version of the scheme of Gibbs, Poole and ! Stockmeyer. It determines such a node for the section subgraph ! specified by MASK and ROOT. ! ! The routine also determines the level structure associated with ! the given pseudo-peripheral node; that is, how far each node ! is from the pseudo-peripheral node. The level structure is ! returned as a list of nodes LS, and pointers to the beginning ! of the list of nodes that are at a distance of 0, 1, 2, ..., ! NODE_NUM-1 from the pseudo-peripheral node. ! ! Modified: ! ! 28 October 2003 ! ! Author: ! ! Alan George, Joseph Liu ! FORTRAN90 version by John Burkardt ! ! Reference: ! ! Alan George, Joseph Liu, ! Computer Solution of Large Sparse Positive Definite Systems, ! Prentice Hall, 1981. ! ! Norman Gibbs, William Poole, Paul Stockmeyer, ! An Algorithm for Reducing the Bandwidth and Profile of a Sparse Matrix, ! SIAM Journal on Numerical Analysis, ! Volume 13, pages 236-250, 1976. ! ! Norman Gibbs, ! Algorithm 509: A Hybrid Profile Reduction Algorithm, ! ACM Transactions on Mathematical Software, ! Volume 2, pages 378-387, 1976. ! ! Parameters: ! ! Input/output, integer ROOT. On input, ROOT is a node in the ! the component of the graph for which a pseudo-peripheral node is ! sought. On output, ROOT is the pseudo-peripheral node obtained. ! ! Input, integer ADJ_NUM, the number of adjacency entries. ! ! Input, integer ADJ_ROW(NODE_NUM+1). Information about row I is stored ! in entries ADJ_ROW(I) through ADJ_ROW(I+1)-1 of ADJ. ! ! Input, integer ADJ(ADJ_NUM), the adjacency structure. ! For each row, it contains the column indices of the nonzero entries. ! ! Input, integer MASK(NODE_NUM), specifies a section subgraph. Nodes ! for which MASK is zero are ignored by FNROOT. ! ! Output, integer LEVEL_NUM, is the number of levels in the level structure ! rooted at the node ROOT. ! ! Output, integer LEVEL_ROW(NODE_NUM+1), LEVEL(NODE_NUM), the ! level structure array pair containing the level structure found. ! ! Input, integer NODE_NUM, the number of nodes. ! implicit none integer adj_num integer node_num integer adj(adj_num) integer adj_row(node_num+1) integer iccsze integer j integer jstrt integer k integer kstop integer kstrt integer level(node_num) integer level_num integer level_num2 integer level_row(node_num+1) integer mask(node_num) integer mindeg integer nabor integer ndeg integer node integer root ! ! Determine the level structure rooted at ROOT. ! call level_set ( root, adj_num, adj_row, adj, mask, level_num, & level_row, level, node_num ) ! ! Count the number of nodes in this level structure. ! iccsze = level_row(level_num+1) - 1 ! ! Extreme case: ! A complete graph has a level set of only a single level. ! Every node is equally good (or bad). ! if ( level_num .eq. 1 ) then return end if ! ! Extreme case: ! A "line graph" 0--0--0--0--0 has every node in its only level. ! By chance, we've stumbled on the ideal root. ! if ( level_num .eq. iccsze ) then return end if ! ! Pick any node from the last level that has minimum degree ! as the starting point to generate a new level set. ! 
do mindeg = iccsze jstrt = level_row(level_num) root = level(jstrt) if ( jstrt .lt. iccsze ) then do j = jstrt, iccsze node = level(j) ndeg = 0 kstrt = adj_row(node) kstop = adj_row(node+1) - 1 do k = kstrt, kstop nabor = adj(k) if ( 0 .lt. mask(nabor) ) then ndeg = ndeg + 1 end if end do if ( ndeg .lt. mindeg ) then root = node mindeg = ndeg end if end do end if ! ! Generate the rooted level structure associated with this node. ! call level_set ( root, adj_num, adj_row, adj, mask, & level_num2, level_row, level, node_num ) ! ! If the number of levels did not increase, accept the new ROOT. ! if ( level_num2 .le. level_num ) then exit end if level_num = level_num2 ! ! In the unlikely case that ROOT is one endpoint of a line graph, ! we can exit now. ! if ( iccsze .le. level_num ) then exit end if end do return end subroutine level_set ( root, adj_num, adj_row, adj, mask, & level_num, level_row, level, node_num ) !*****************************************************************************80 ! !! LEVEL_SET generates the connected level structure rooted at a given node. ! ! Discussion: ! ! Only nodes for which MASK is nonzero will be considered. ! ! The root node chosen by the user is assigned level 1, and masked. ! All (unmasked) nodes reachable from a node in level 1 are ! assigned level 2 and masked. The process continues until there ! are no unmasked nodes adjacent to any node in the current level. ! The number of levels may vary between 2 and NODE_NUM. ! ! Modified: ! ! 28 October 2003 ! ! Author: ! ! Alan George, Joseph Liu ! FORTRAN90 version by John Burkardt ! ! Reference: ! ! Alan George, Joseph Liu, ! Computer Solution of Large Sparse Positive Definite Systems, ! Prentice Hall, 1981. ! ! Parameters: ! ! Input, integer ROOT, the node at which the level structure ! is to be rooted. ! ! Input, integer ADJ_NUM, the number of adjacency entries. ! ! Input, integer ADJ_ROW(NODE_NUM+1). Information about row I is stored ! in entries ADJ_ROW(I) through ADJ_ROW(I+1)-1 of ADJ. ! ! Input, integer ADJ(ADJ_NUM), the adjacency structure. ! For each row, it contains the column indices of the nonzero entries. ! ! Input/output, integer MASK(NODE_NUM). On input, only nodes with nonzero ! MASK are to be processed. On output, those nodes which were included ! in the level set have MASK set to 1. ! ! Output, integer LEVEL_NUM, the number of levels in the level ! structure. ROOT is in level 1. The neighbors of ROOT ! are in level 2, and so on. ! ! Output, integer LEVEL_ROW(NODE_NUM+1), LEVEL(NODE_NUM), the rooted ! level structure. ! ! Input, integer NODE_NUM, the number of nodes. ! implicit none integer adj_num integer node_num integer adj(adj_num) integer adj_row(node_num+1) integer i integer iccsze integer j integer jstop integer jstrt integer lbegin integer level_num integer level_row(node_num+1) integer level(node_num) integer lvlend integer lvsize integer mask(node_num) integer nbr integer node integer root mask(root) = 0 level(1) = root level_num = 0 lvlend = 0 iccsze = 1 ! ! LBEGIN is the pointer to the beginning of the current level, and ! LVLEND points to the end of this level. ! do lbegin = lvlend + 1 lvlend = iccsze level_num = level_num + 1 level_row(level_num) = lbegin ! ! Generate the next level by finding all the masked neighbors of nodes ! in the current level. ! do i = lbegin, lvlend node = level(i) jstrt = adj_row(node) jstop = adj_row(node+1) - 1 do j = jstrt, jstop nbr = adj(j) if ( mask(nbr) .ne. 0 ) then iccsze = iccsze + 1 level(iccsze) = nbr mask(nbr) = 0 end if end do end do ! ! 
Compute the current level width (the number of nodes encountered.) ! If it is positive, generate the next level. ! lvsize = iccsze - lvlend if ( lvsize .le. 0 ) then exit end if end do level_row(level_num+1) = lvlend + 1 ! ! Reset MASK to 1 for the nodes in the level structure. ! do i =1 ,iccsze mask(level(i)) = 1 enddo return end subroutine degree ( root, adj_num, adj_row, adj, mask, deg, & iccsze, ls, node_num ) !*****************************************************************************80 ! !! DEGREE computes the degrees of the nodes in the connected component. ! ! Discussion: ! ! The connected component is specified by MASK and ROOT. ! Nodes for which MASK is zero are ignored. ! ! Modified: ! ! 05 January 2003 ! ! Author: ! ! Alan George, Joseph Liu ! FORTRAN90 version by John Burkardt ! ! Reference: ! ! Alan George, Joseph Liu, ! Computer Solution of Large Sparse Positive Definite Systems, ! Prentice Hall, 1981. ! ! Parameters: ! ! Input, integer ROOT, the node that defines the connected component. ! ! Input, integer ADJ_NUM, the number of adjacency entries. ! ! Input, integer ADJ_ROW(NODE_NUM+1). Information about row I is stored ! in entries ADJ_ROW(I) through ADJ_ROW(I+1)-1 of ADJ. ! ! Input, integer ADJ(ADJ_NUM), the adjacency structure. ! For each row, it contains the column indices of the nonzero entries. ! ! Input, integer MASK(NODE_NUM), is nonzero for those nodes which are ! to be considered. ! ! Output, integer DEG(NODE_NUM), contains, for each node in the connected ! component, its degree. ! ! Output, integer ICCSIZE, the number of nodes in the connected component. ! ! Output, integer LS(NODE_NUM), stores in entries 1 through ICCSIZE the nodes ! in the connected component, starting with ROOT, and proceeding ! by levels. ! ! Input, integer NODE_NUM, the number of nodes. ! implicit none integer adj_num integer node_num integer adj(adj_num) integer adj_row(node_num+1) integer deg(node_num) integer i integer iccsze integer ideg integer j integer jstop integer jstrt integer lbegin integer ls(node_num) integer lvlend integer lvsize integer mask(node_num) integer nbr integer node integer root ! ! The sign of ADJ_ROW(I) is used to indicate if node I has been considered. ls(1) = root adj_row(root) = -adj_row(root) lvlend = 0 iccsze = 1 ! ! LBEGIN is the pointer to the beginning of the current level, and ! LVLEND points to the end of this level. do lbegin = lvlend + 1 lvlend = iccsze ! ! Find the degrees of nodes in the current level, ! and at the same time, generate the next level. do i = lbegin, lvlend node = ls(i) jstrt = -adj_row(node) jstop = abs ( adj_row(node+1) ) - 1 ideg = 0 do j = jstrt, jstop nbr = adj(j) if ( mask(nbr) .ne. 0 ) then ideg = ideg + 1 if ( 0 .le. adj_row(nbr) ) then adj_row(nbr) = -adj_row(nbr) iccsze = iccsze + 1 ls(iccsze) = nbr end if end if end do deg(node) = ideg end do ! ! Compute the current level width. lvsize = iccsze - lvlend ! ! If the current level width is nonzero, generate another level. if ( lvsize .eq. 0 ) then exit end if end do ! ! Reset ADJ_ROW to its correct sign and return. 
do i = 1, iccsze node = ls(i) adj_row(node) = -adj_row(node) end do return end RandomFieldsUtils/src/parallel_base.h0000644000176200001440000001231514227157055017402 0ustar liggesusers/* Authors Martin Schlather, schlather@math.uni-mannheim.de Copyright (C) 2021 -- 2021 Martin Schlather This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #ifndef parallel_omp_base_H #define parallel_omp_base_H 1 // NEVER_** is used by parallel_**, so no conflict with NO_, which is used // in the programmes. Except that there is no differences between // NEVER_ and NO_ // #define NEVER_OMP 1 // #define NEVER_AVX 1 // #define NEVER_SSE 1 #if defined WIN32 || defined _WIN32 || defined __WIN32__ #define MSDOS_WINDOWS 1 #elif defined MSDOS_WINDOWS #undef MSDOS_WINDOWS #endif #if defined __x86_64 || defined __x86_64__ || defined __amd64__ || defined __amd64 || defined _M_X64 #define X86_64 1 #elif defined X86_64 #undef X86_64 #endif #if defined __arm64__ || defined __arm64 || defined __aarch64__ #define ARM64 1 #elif defined ARM64 #undef ARM64 #endif #if defined __arm32__ || defined __arm__ || defined ARM64 #define ARM32 1 #define NO_AVX 1 #endif #if defined __ARM_NEON__ || defined __aarch64__ || defined _M_ARM || defined _M_ARM64 #define NEON 1 #elif defined NEON #undef NEON #endif #if defined MSDOS_WINDOWS || defined (__APPLE__) || defined(__sun) #if defined TIME_AVAILABLE #undef TIME_AVAILABLE #endif #else #define TIME_AVAILABLE 1 #endif #if defined _OPENMP && ! defined NO_OMP && ! defined NEVER_OMP && ! defined ARM32 && ! defined __APPLE__ // 15 Jan 2022 #if defined SCHLATHERS_MACHINE #define DO_PARALLEL 1 // may change value when debugging #else #define DO_PARALLEL 1// never changes value #endif #elif defined DO_PARALLEL #undef DO_PARALLEL #endif #if defined NEVER_SSE #ifndef NO_SSE2 #define NO_SSE2 1 #endif #elif defined NEVER_AVX #ifndef NO_AVX #define NO_AVX 1 #endif #elif defined NEVER_AVX512 #ifndef NO_AVX512 #define NO_AVX512 1 #endif #endif #if defined NO_SSE2 && ! defined NO_SSE3 #define NO_SSE3 1 #endif #if defined NO_SSE3 && ! defined NO_SSSE3 #define NO_SSSE3 1 #endif #if defined NO_SSSE3 && ! defined NO_SSE41 #define NO_SSE41 1 #endif #if defined NO_SSE41 && ! defined NO_AVX #define NO_AVX 1 #endif #if defined NO_AVX && ! defined NO_AVX2 #define NO_AVX2 1 #endif #if defined NO_AVX2 && ! defined NO_AVX512 #define NO_AVX512 1 #endif #if ! defined NO_AVX512 #if ! defined DO_AVX512BITALG && ! defined DO_AVX512BW && ! defined DO_AVX512CD && ! defined DO_AVX512DQ && ! defined DO_AVX512ER && ! defined DO_AVX512F && ! defined DO_AVX512IFMA && ! defined DO_AVX512PF && ! defined DO_AVX512VBMI && ! defined DO_AVX512VL && ! defined DO_AVX512VPOPCNTDQ && ! defined DO_AVX5124FMAPS && ! 
defined DO_AVX5124VNNIW #define DO_AVX512BITALG 1 #define DO_AVX512BW 1 #define DO_AVX512CD 1 #define DO_AVX512DQ 1 #define DO_AVX512ER 1 #define DO_AVX512F 1 #define DO_AVX512IFMA 1 #define DO_AVX512PF 1 #define DO_AVX512VBMI 1 #define DO_AVX512VL 1 #define DO_AVX512VPOPCNTDQ 1 #define DO_AVX5124FMAPS 1 #define DO_AVX5124VNNIW 1 #endif #if defined __AVX512BITALG__ && defined DO_AVX512BITALG #define AVX512BITALG 1 #endif #if defined __AVX512BW__ && defined DO_AVX512BW #define AVX512BW 1 #endif #if defined __AVX512CD__ && defined DO_AVX512CD #define AVX512CD 1 #endif #if defined __AVX512DQ__ && defined DO_AVX512DQ #define AVX512DQ 1 #endif #if defined __AVX512ER__ && defined DO_AVX512ER #define AVX512ER 1 #endif #if defined __AVX512F__ && defined DO_AVX512F #define AVX512F 1 #define AVX512 1 #endif #if defined __AVX512IFMA__ && defined DO_AVX512IFMA #define AVX512IFMA 1 #endif #if defined __AVX512PF__ && defined DO_AVX512PF #define AVX512PF 1 #endif #if defined __AVX512VBMI__ && defined DO_AVX512VBMI #define AVX512VBMI 1 #endif #if defined __AVX512VL__ && defined DO_AVX512VL #define AVX512VL 1 // #endif #if defined __AVX512VPOPCNTDQ__ && defined DO_AVX512VPOPCNTDQ #define AVX512VPOPCNTDQ 1 // #endif #if defined __AVX5124FMAPS__ && defined DO_AVX5124FMAPS #define AVX5124FMAPS 1 #endif #if defined __AVX5124VNNIW__ && defined DO_AVX5124VNNIW #define AVX5124VNNIW 1 #endif #endif // end ! no 512 #if defined __AVX2__ && ! defined NO_AVX2 #define AVX2 1 #elif defined AVX2 #undef AVX2 #endif #if defined __AVX__ && ! defined NO_AVX #define AVX 1 #elif defined AVX #undef AVX #endif #if (defined __SSE41__ || defined NEON) && ! defined NO_SSE41 #define SSE41 1 #elif defined SSE41 #undef SSE41 #endif #if (defined __SSSE3__ || defined NEON) && ! defined NO_SSSE3 #define SSSE3 1 #elif defined SSSE3 #undef SSSE3 #endif #if (defined __SSE3__ || defined NEON) && ! defined NO_SSE3 #define SSE3 1 #elif defined SSE3 #undef SSE3 #endif #if (defined __SSE2__ || defined NEON) && ! 
defined NO_SSE2 #define SSE2 1 #elif defined SSE2 #undef SSE2 #endif #endif RandomFieldsUtils/src/Makevars.in0000644000176200001440000000236314227157055016546 0ustar liggesusers# Paths of installation NVCC=@MY_CUDA_HOME@/bin/nvcc # Target architectures # CUTLASS_NVCC_ARCHS is not needed for <75 as no cutlass is involved cu61= 61 70 75 80 86 cu61_flags=$(foreach i,$(cu61),--generate-code arch=compute_$(i),code=sm_$(i)) cu61_nums=$(subst $() $(),;,$(cu61)) cu75= 75 80 86 cu75_nums=$(subst $() $(),;,$(cu75)) cu75_flags=$(foreach i,$(cu75),--generate-code arch=compute_$(i),code=sm_$(i)) -DCUTLASS_NVCC_ARCHS='$(cu75_nums)' PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) $(SHLIB_OPENMP_CXXFLAGS) \ @MY_LIB_FLAGS@ @MY_CUDA_LIBS@ PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) @MY_PKG_FLAGS@ @MY_SSE2@ @MY_SSE3@ @MY_SSSE3@ @MY_SSE41@ @MY_AVX@ @MY_AVX2@ @MY_AVX512F@ all_cu_flags=-std=c++14 -Xcompiler -fopenmp -DSCHLATHERS_MACHINE \ --expt-extended-lambda $(CLINK_CPPFLAGS) -I $(R_INCLUDE_DIR) \ -I@MY_CUDA_HOME@/include -I./cutlass \ -Xcompiler -fpic -DUSEGPU -g -x cu # Details of CUDA compilation %61.o: %61.cu $(NVCC) $(cu61_flags) $(all_cu_flags) -c $< -o $@ %75.o: %75.cu $(NVCC) $(cu75_flags) $(all_cu_flags) -c $< -o $@ # Final compilation OBJECTS = @MY_CU_FILES@ @MY_C_FILES@ all: $(SHLIB) .PHONY: all #all: $(SHLIB) clean # .PHONY: all clean # Clean-up rules #clean: ## rm -r -f *.o Makevars # rm -f Makevars RandomFieldsUtils/src/sse2neon.h0000644000176200001440000123307014227157055016354 0ustar liggesusers#ifndef SSE2NEON_H #define SSE2NEON_H #include #include #include #include #if ! defined MALLOCX #define SQRT(X) std::sqrt((double) X) // OK #define FLOOR std::floor #define CEIL(X) std::ceil((double) X) // OK; keine Klammern um X! #define MALLOCX std::malloc #define FREEX std::free #define MALLOC MALLOCX #define FREE(X) if ((X) == NULL) {} else {FREEX(X); (X)=NULL;} #endif #define BUG_SSE2NEON error("Severe error occured in sse2neon. Please contact schlather@math.uni-mannheim.de") #define ZERO_SSE2NEON _mm_setzero_si128 // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! // !!!!!!!!!!!!! NOTE THAT THIS FILE HAS BEEN MODIFIED !!!!!!!!!!!!!!! // !!!!!!!!!!!!! AND SHOULD NOT BE DISTRIBUTED OUTSIDE !!!!!!!!!!!!!!! // !!!!!!!!!!!!! THIS R PACKAGE !!!!!!!!!!!!!!! // !!!!!!!!!!!!! 13 Jan 2022, Martin Schlather !!!!!!!!!!!!!!! // !!!!!!!!!!!!! 27 Mar 2022, Martin Schlather !!!!!!!!!!!!!!! // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! // This header file provides a simple API translation layer // between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions // // This header file does not yet translate all of the SSE intrinsics. // // Contributors to this work are: // John W. Ratcliff // Brandon Rowlett // Ken Fast // Eric van Beurden // Alexander Potylitsin // Hasindu Gamaarachchi // Jim Huang // Mark Cheng // Malcolm James MacLeod // Devin Hussey (easyaspi314) // Sebastian Pop // Developer Ecosystem Engineering // Danila Kutenin // François Turban (JishinMaster) // Pei-Hsuan Hung // Yang-Hao Yuan // Syoyo Fujita // Brecht Van Lommel /* * sse2neon is freely redistributable under the MIT License. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /* Tunable configurations */ /* Enable precise implementation of math operations * This would slow down the computation a bit, but gives consistent result with * x86 SSE. (e.g. would solve a hole or NaN pixel in the rendering result) */ /* _mm_min|max_ps|ss|pd|sd */ #ifndef SSE2NEON_PRECISE_MINMAX #define SSE2NEON_PRECISE_MINMAX (0) #endif /* _mm_rcp_ps and _mm_div_ps */ #ifndef SSE2NEON_PRECISE_DIV #define SSE2NEON_PRECISE_DIV (0) #endif /* _mm_sqrt_ps and _mm_rsqrt_ps */ #ifndef SSE2NEON_PRECISE_SQRT #define SSE2NEON_PRECISE_SQRT (0) #endif /* _mm_dp_pd */ #ifndef SSE2NEON_PRECISE_DP #define SSE2NEON_PRECISE_DP (0) #endif /* compiler specific definitions */ #if defined(__GNUC__) || defined(__clang__) #pragma push_macro("FORCE_INLINE") #pragma push_macro("ALIGN_STRUCT") #define FORCE_INLINE static inline __attribute__((always_inline)) #define ALIGN_STRUCT(x) __attribute__((aligned(x))) #define _sse2neon_likely(x) __builtin_expect(!!(x), 1) #define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0) #else /* non-GNU / non-clang compilers */ #warning "Macro name collisions may happen with unsupported compiler." #ifndef FORCE_INLINE #define FORCE_INLINE static inline #endif #ifndef ALIGN_STRUCT #define ALIGN_STRUCT(x) __declspec(align(x)) #endif #define _sse2neon_likely(x) (x) #define _sse2neon_unlikely(x) (x) #endif //#include //#include /* Architecture-specific build options */ /* FIXME: #pragma GCC push_options is only available on GCC */ #if defined(__GNUC__) #if defined(__arm__) && __ARM_ARCH == 7 /* According to ARM C Language Extensions Architecture specification, * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON) * architecture supported. */ #if !defined(__ARM_NEON) || !defined(__ARM_NEON__) #error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON." #endif #if !defined(__clang__) #pragma GCC push_options #pragma GCC target("fpu=neon") #endif #elif defined(__aarch64__) #if !defined(__clang__) #pragma GCC push_options #pragma GCC target("+simd") #endif #else #error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A." #endif #endif #include /* Rounding functions require either Aarch64 instructions or libm failback */ #if !defined(__aarch64__) #include #endif /* "__has_builtin" can be used to query support for built-in functions * provided by gcc/clang and other compilers that support it. 
*/ #ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */ /* Compatibility with gcc <= 9 */ #if defined(__GNUC__) && (__GNUC__ <= 9) #define __has_builtin(x) HAS##x #define HAS__builtin_popcount 1 #define HAS__builtin_popcountll 1 #else #define __has_builtin(x) 0 #endif #endif /** * MACRO for shuffle parameter for _mm_shuffle_ps(). * Argument fp3 is a digit[0123] that represents the fp from argument "b" * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same * for fp2 in result. fp1 is a digit[0123] that represents the fp from * argument "a" of mm_shuffle_ps that will be places in fp1 of result. * fp0 is the same for fp0 of result. */ #define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \ (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0))) /* Rounding mode macros. */ #define _MM_FROUND_TO_NEAREST_INT 0x00 #define _MM_FROUND_TO_NEG_INF 0x01 #define _MM_FROUND_TO_POS_INF 0x02 #define _MM_FROUND_TO_ZERO 0x03 #define _MM_FROUND_CUR_DIRECTION 0x04 #define _MM_FROUND_NO_EXC 0x08 #define _MM_FROUND_RAISE_EXC 0x00 #define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC) #define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC) #define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC) #define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC) #define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC) #define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC) #define _MM_ROUND_NEAREST 0x0000 #define _MM_ROUND_DOWN 0x2000 #define _MM_ROUND_UP 0x4000 #define _MM_ROUND_TOWARD_ZERO 0x6000 /* Flush zero mode macros. */ #define _MM_FLUSH_ZERO_MASK 0x8000 #define _MM_FLUSH_ZERO_ON 0x8000 #define _MM_FLUSH_ZERO_OFF 0x0000 /* Denormals are zeros mode macros. */ #define _MM_DENORMALS_ZERO_MASK 0x0040 #define _MM_DENORMALS_ZERO_ON 0x0040 #define _MM_DENORMALS_ZERO_OFF 0x0000 /* indicate immediate constant argument in a given range */ #define __constrange(a, b) const /* A few intrinsics accept traditional data types like ints or floats, but * most operate on data types that are specific to SSE. * If a vector type ends in d, it contains doubles, and if it does not have * a suffix, it contains floats. An integer vector type can contain any type * of integer, from chars to shorts to unsigned long longs. */ typedef int64x1_t __m64; typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */ // On ARM 32-bit architecture, the float64x2_t is not supported. // The data type __m128d should be represented in a different way for related // intrinsic conversion. 
#if defined(__aarch64__) typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */ #else typedef float32x4_t __m128d; #endif typedef int64x2_t __m128i; /* 128-bit vector containing integers */ // __int64 is defined in the Intrinsics Guide which maps to different datatype // in different data model #if !(defined(_WIN32) || defined(_WIN64) || defined(__int64)) #if (defined(__x86_64__) || defined(__i386__)) #define __int64 long long #else #define __int64 int64_t #endif #endif /* type-safe casting between types */ #define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x) #define vreinterpretq_m128_f32(x) (x) #define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x) #define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x) #define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x) #define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x) #define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x) #define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x) #define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x) #define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x) #define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x) #define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x) #define vreinterpretq_f32_m128(x) (x) #define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x) #define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x) #define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x) #define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x) #define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x) #define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x) #define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x) #define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x) #define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x) #define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x) #define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x) #define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x) #define vreinterpretq_m128i_s64(x) (x) #define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x) #define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x) #define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x) #define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x) #define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x) #define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x) #define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x) #define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x) #define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x) #define vreinterpretq_s64_m128i(x) (x) #define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x) #define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x) #define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x) #define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x) #define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x) #define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x) #define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x) #define vreinterpret_m64_s64(x) (x) #define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x) #define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x) #define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x) #define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x) #define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x) #define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x) #define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x) #define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x) #define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x) #define vreinterpret_u32_m64(x) 
vreinterpret_u32_s64(x) #define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x) #define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x) #define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x) #define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x) #define vreinterpret_s64_m64(x) (x) #define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x) #if defined(__aarch64__) #define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x) #define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x) #define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x) #define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x) #define vreinterpretq_m128d_f64(x) (x) #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x) #define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x) #define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x) #define vreinterpretq_f64_m128d(x) (x) #define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x) #else #define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x) #define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x) #define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x) #define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x) #define vreinterpretq_m128d_f32(x) (x) #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x) #define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x) #define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x) #define vreinterpretq_f32_m128d(x) (x) #endif // A struct is defined in this header file called 'SIMDVec' which can be used // by applications which attempt to access the contents of an __m128 struct // directly. It is important to note that accessing the __m128 struct directly // is bad coding practice by Microsoft: @see: // https://docs.microsoft.com/en-us/cpp/cpp/m128 // // However, some legacy source code may try to access the contents of an __m128 // struct directly so the developer can use the SIMDVec as an alias for it. Any // casting must be done manually by the developer, as you cannot cast or // otherwise alias the base NEON data type for intrinsic operations. // // union intended to allow direct access to an __m128 variable using the names // that the MSVC compiler provides. This union should really only be used when // trying to access the members of the vector as integer values. GCC/clang // allow native access to the float members through a simple array access // operator (in C since 4.6, in C++ since 4.8). // // Ideally direct accesses to SIMD vectors should not be used since it can cause // a performance hit. If it really is needed however, the original __m128 // variable can be aliased with a pointer to this union and used to access // individual components. The use of this union should be hidden behind a macro // that is used throughout the codebase to access the members instead of always // declaring this type of variable. typedef union ALIGN_STRUCT(16) SIMDVec { float m128_f32[4]; // as floats - DON'T USE. Added for convenience. int8_t m128_i8[16]; // as signed 8-bit integers. int16_t m128_i16[8]; // as signed 16-bit integers. int32_t m128_i32[4]; // as signed 32-bit integers. int64_t m128_i64[2]; // as signed 64-bit integers. uint8_t m128_u8[16]; // as unsigned 8-bit integers. uint16_t m128_u16[8]; // as unsigned 16-bit integers. uint32_t m128_u32[4]; // as unsigned 32-bit integers. uint64_t m128_u64[2]; // as unsigned 64-bit integers. 
} SIMDVec; // casting using SIMDVec #define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n]) #define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n]) #define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n]) /* SSE macros */ #define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode #define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode #define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode #define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode // Function declaration // SSE FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(); FORCE_INLINE __m128 _mm_move_ss(__m128, __m128); FORCE_INLINE __m128 _mm_or_ps(__m128, __m128); FORCE_INLINE __m128 _mm_set_ps1(float); FORCE_INLINE __m128 _mm_setzero_ps(void); // SSE2 FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i); FORCE_INLINE __m128i _mm_castps_si128(__m128); FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i); FORCE_INLINE __m128i _mm_cvtps_epi32(__m128); FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d); FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i); FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int); FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t); FORCE_INLINE __m128d _mm_set_pd(double, double); FORCE_INLINE __m128i _mm_set1_epi32(int); FORCE_INLINE __m128i _mm_setzero_si128(); // SSE4.1 FORCE_INLINE __m128d _mm_ceil_pd(__m128d); FORCE_INLINE __m128 _mm_ceil_ps(__m128); FORCE_INLINE __m128d _mm_floor_pd(__m128d); FORCE_INLINE __m128 _mm_floor_ps(__m128); FORCE_INLINE __m128d _mm_round_pd(__m128d, int); FORCE_INLINE __m128 _mm_round_ps(__m128, int); // SSE4.2 FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t); /* Backwards compatibility for compilers with lack of specific type support */ // Older gcc does not define vld1q_u8_x4 type #if defined(__GNUC__) && !defined(__clang__) && \ ((__GNUC__ <= 10 && defined(__arm__)) || \ (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \ (__GNUC__ <= 9 && defined(__aarch64__))) FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p) { uint8x16x4_t ret; ret.val[0] = vld1q_u8(p + 0); ret.val[1] = vld1q_u8(p + 16); ret.val[2] = vld1q_u8(p + 32); ret.val[3] = vld1q_u8(p + 48); return ret; } #else // Wraps vld1q_u8_x4 FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p) { return vld1q_u8_x4(p); } #endif /* Function Naming Conventions * The naming convention of SSE intrinsics is straightforward. A generic SSE * intrinsic function is given as follows: * _mm__ * * The parts of this format are given as follows: * 1. describes the operation performed by the intrinsic * 2. identifies the data type of the function's primary arguments * * This last part, , is a little complicated. It identifies the * content of the input values, and can be set to any of the following values: * + ps - vectors contain floats (ps stands for packed single-precision) * + pd - vectors cantain doubles (pd stands for packed double-precision) * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit * signed integers * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit * unsigned integers * + si128 - unspecified 128-bit vector or 256-bit vector * + m128/m128i/m128d - identifies input vector types when they are different * than the type of the returned vector * * For example, _mm_setzero_ps. The _mm implies that the function returns * a 128-bit vector. The _ps at the end implies that the argument vectors * contain floats. 
* * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8) * // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits * __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); * // Set packed 8-bit integers * // 128 bits, 16 chars, per 8 bits * __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11, * 4, 5, 12, 13, 6, 7, 14, 15); * // Shuffle packed 8-bit integers * __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb * * Data (Number, Binary, Byte Index): +------+------+-------------+------+------+-------------+ | 1 | 2 | 3 | 4 | Number +------+------+------+------+------+------+------+------+ | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary +------+------+------+------+------+------+------+------+ | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | Index +------+------+------+------+------+------+------+------+ +------+------+------+------+------+------+------+------+ | 5 | 6 | 7 | 8 | Number +------+------+------+------+------+------+------+------+ | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary +------+------+------+------+------+------+------+------+ | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | Index +------+------+------+------+------+------+------+------+ * Index (Byte Index): +------+------+------+------+------+------+------+------+ | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | +------+------+------+------+------+------+------+------+ +------+------+------+------+------+------+------+------+ | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | +------+------+------+------+------+------+------+------+ * Result: +------+------+------+------+------+------+------+------+ | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | Index +------+------+------+------+------+------+------+------+ | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary +------+------+------+------+------+------+------+------+ | 256 | 2 | 5 | 6 | Number +------+------+------+------+------+------+------+------+ +------+------+------+------+------+------+------+------+ | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | Index +------+------+------+------+------+------+------+------+ | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary +------+------+------+------+------+------+------+------+ | 3 | 7 | 4 | 8 | Number +------+------+------+------+------+------+-------------+ */ /* Constants for use with _mm_prefetch. */ enum _mm_hint { _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */ _MM_HINT_T0 = 1, /* load data to L1 and L2 cache */ _MM_HINT_T1 = 2, /* load data to L2 cache only */ _MM_HINT_T2 = 3, /* load data to L2 cache only, mark it as NTA */ _MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */ _MM_HINT_ET0 = 5, /* exclusive version of _MM_HINT_T0 */ _MM_HINT_ET1 = 6, /* exclusive version of _MM_HINT_T1 */ _MM_HINT_ET2 = 7 /* exclusive version of _MM_HINT_T2 */ }; // The bit field mapping to the FPCR(floating-point control register) typedef struct { uint16_t res0; uint8_t res1 : 6; uint8_t bit22 : 1; uint8_t bit23 : 1; uint8_t bit24 : 1; uint8_t res2 : 7; #if defined(__aarch64__) uint32_t res3; #endif } fpcr_bitfield; // Takes the upper 64 bits of a and places it in the low end of the result // Takes the lower 64 bits of b and places it into the high end of the result. 
FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b) { float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); return vreinterpretq_m128_f32(vcombine_f32(a32, b10)); } // takes the lower two 32-bit values from a and swaps them and places in high // end of result takes the higher two 32 bit values from b and swaps them and // places in low end of result. FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b) { float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b))); return vreinterpretq_m128_f32(vcombine_f32(a01, b23)); } FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b) { float32x2_t a21 = vget_high_f32( vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); float32x2_t b03 = vget_low_f32( vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); return vreinterpretq_m128_f32(vcombine_f32(a21, b03)); } FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b) { float32x2_t a03 = vget_low_f32( vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); float32x2_t b21 = vget_high_f32( vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); return vreinterpretq_m128_f32(vcombine_f32(a03, b21)); } FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b) { float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); return vreinterpretq_m128_f32(vcombine_f32(a10, b10)); } FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b) { float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); return vreinterpretq_m128_f32(vcombine_f32(a01, b10)); } FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b) { float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b))); return vreinterpretq_m128_f32(vcombine_f32(a01, b01)); } // keeps the low 64 bits of b in the low and puts the high 64 bits of a in the // high FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b) { float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); return vreinterpretq_m128_f32(vcombine_f32(a10, b32)); } FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b) { float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1); float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); return vreinterpretq_m128_f32(vcombine_f32(a11, b00)); } FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b) { float32x2_t a22 = vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); return vreinterpretq_m128_f32(vcombine_f32(a22, b00)); } FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b) { float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0); float32x2_t b22 = vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0); return vreinterpretq_m128_f32(vcombine_f32(a00, b22)); } FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b) { float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); float32x2_t a22 = vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/ float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); 
return vreinterpretq_m128_f32(vcombine_f32(a02, b32)); } FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b) { float32x2_t a33 = vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1); float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1); return vreinterpretq_m128_f32(vcombine_f32(a33, b11)); } FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b) { float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2); float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); float32x2_t b20 = vset_lane_f32(b2, b00, 1); return vreinterpretq_m128_f32(vcombine_f32(a10, b20)); } FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b) { float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); float32_t b2 = vgetq_lane_f32(b, 2); float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); float32x2_t b20 = vset_lane_f32(b2, b00, 1); return vreinterpretq_m128_f32(vcombine_f32(a01, b20)); } FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b) { float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); float32_t b2 = vgetq_lane_f32(b, 2); float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); float32x2_t b20 = vset_lane_f32(b2, b00, 1); return vreinterpretq_m128_f32(vcombine_f32(a32, b20)); } // Kahan summation for accurate summation of floating-point numbers. // http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y) { y -= *c; float t = *sum + y; *c = (t - *sum) - y; *sum = t; } #if defined(__ARM_FEATURE_CRYPTO) // Wraps vmull_p64 FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) { poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0); poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0); return vreinterpretq_u64_p128(vmull_p64(a, b)); } #else // ARMv7 polyfill // ARMv7/some A64 lacks vmull_p64, but it has vmull_p8. // // vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a // 64-bit->128-bit polynomial multiply. // // It needs some work and is somewhat slow, but it is still faster than all // known scalar methods. 
// // Algorithm adapted to C from // https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted // from "Fast Software Polynomial Multiplication on ARM Processors Using the // NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab // (https://hal.inria.fr/hal-01506572) static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) { poly8x8_t a = vreinterpret_p8_u64(_a); poly8x8_t b = vreinterpret_p8_u64(_b); // Masks uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff), vcreate_u8(0x00000000ffffffff)); uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff), vcreate_u8(0x0000000000000000)); // Do the multiplies, rotating with vext to get all combinations uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0 uint8x16_t e = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1 uint8x16_t f = vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0 uint8x16_t g = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2 uint8x16_t h = vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0 uint8x16_t i = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3 uint8x16_t j = vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0 uint8x16_t k = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // L = A0 * B4 // Add cross products uint8x16_t l = veorq_u8(e, f); // L = E + F uint8x16_t m = veorq_u8(g, h); // M = G + H uint8x16_t n = veorq_u8(i, j); // N = I + J // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL // instructions. #if defined(__aarch64__) uint8x16_t lm_p0 = vreinterpretq_u8_u64( vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); uint8x16_t lm_p1 = vreinterpretq_u8_u64( vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); uint8x16_t nk_p0 = vreinterpretq_u8_u64( vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); uint8x16_t nk_p1 = vreinterpretq_u8_u64( vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); #else uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m)); uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m)); uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k)); uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k)); #endif // t0 = (L) (P0 + P1) << 8 // t1 = (M) (P2 + P3) << 16 uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1); uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32); uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h); // t2 = (N) (P4 + P5) << 24 // t3 = (K) (P6 + P7) << 32 uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1); uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00); uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h); // De-interleave #if defined(__aarch64__) uint8x16_t t0 = vreinterpretq_u8_u64( vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); uint8x16_t t1 = vreinterpretq_u8_u64( vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); uint8x16_t t2 = vreinterpretq_u8_u64( vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); uint8x16_t t3 = vreinterpretq_u8_u64( vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); #else uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h)); uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h)); uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h)); uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h)); #endif // Shift the cross products uint8x16_t t0_shift 
= vextq_u8(t0, t0, 15); // t0 << 8 uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16 uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24 uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32 // Accumulate the products uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift); uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift); uint8x16_t mix = veorq_u8(d, cross1); uint8x16_t r = veorq_u8(mix, cross2); return vreinterpretq_u64_u8(r); } #endif // ARMv7 polyfill // C equivalent: // __m128i _mm_shuffle_epi32_default(__m128i a, // __constrange(0, 255) int imm) { // __m128i ret; // ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; // ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03]; // return ret; // } #define _mm_shuffle_epi32_default(a, imm) \ __extension__({ \ int32x4_t ret; \ ret = vmovq_n_s32( \ vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3))); \ ret = vsetq_lane_s32( \ vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \ ret, 1); \ ret = vsetq_lane_s32( \ vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \ ret, 2); \ ret = vsetq_lane_s32( \ vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \ ret, 3); \ vreinterpretq_m128i_s32(ret); \ }) // Takes the upper 64 bits of a and places it in the low end of the result // Takes the lower 64 bits of a and places it into the high end of the result. FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a) { int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); return vreinterpretq_m128i_s32(vcombine_s32(a32, a10)); } // takes the lower two 32-bit values from a and swaps them and places in low end // of result takes the higher two 32 bit values from a and swaps them and places // in high end of result. 
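/* Illustrative sketch (invented helper name): the scalar semantics shared
 * by _mm_shuffle_epi32_default above and the fixed _mm_shuffle_epi_*
 * helpers around it.  A helper named _mm_shuffle_epi_DCBA corresponds to
 * imm8 == _MM_SHUFFLE(D, C, B, A), i.e. dst = {a[A], a[B], a[C], a[D]}. */
FORCE_INLINE void _sse2neon_doc_shuffle_epi32_model(const int32_t a[4],
                                                    int imm8,
                                                    int32_t dst[4])
{
    dst[0] = a[imm8 & 0x3];
    dst[1] = a[(imm8 >> 2) & 0x3];
    dst[2] = a[(imm8 >> 4) & 0x3];
    dst[3] = a[(imm8 >> 6) & 0x3];
}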
FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a) { int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a))); return vreinterpretq_m128i_s32(vcombine_s32(a01, a23)); } // rotates the least significant 32 bits into the most significant 32 bits, and // shifts the rest down FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a) { return vreinterpretq_m128i_s32( vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1)); } // rotates the most significant 32 bits into the least significant 32 bits, and // shifts the rest up FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a) { return vreinterpretq_m128i_s32( vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3)); } // gets the lower 64 bits of a, and places it in the upper 64 bits // gets the lower 64 bits of a and places it in the lower 64 bits FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a) { int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); return vreinterpretq_m128i_s32(vcombine_s32(a10, a10)); } // gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the // lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a) { int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); return vreinterpretq_m128i_s32(vcombine_s32(a01, a10)); } // gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the // upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and // places it in the lower 64 bits FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a) { int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); return vreinterpretq_m128i_s32(vcombine_s32(a01, a01)); } FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a) { int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1); int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); return vreinterpretq_m128i_s32(vcombine_s32(a11, a22)); } FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a) { int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); return vreinterpretq_m128i_s32(vcombine_s32(a22, a01)); } FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a) { int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1); return vreinterpretq_m128i_s32(vcombine_s32(a32, a33)); } // FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255) // int imm) #if defined(__aarch64__) #define _mm_shuffle_epi32_splat(a, imm) \ __extension__({ \ vreinterpretq_m128i_s32( \ vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \ }) #else #define _mm_shuffle_epi32_splat(a, imm) \ __extension__({ \ vreinterpretq_m128i_s32( \ vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \ }) #endif // NEON does not support a general purpose permute intrinsic // Selects four specific single-precision, floating-point values from a and b, // based on the mask i. 
// // C equivalent: // __m128 _mm_shuffle_ps_default(__m128 a, __m128 b, // __constrange(0, 255) int imm) { // __m128 ret; // ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; // ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03]; // return ret; // } // // https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx #define _mm_shuffle_ps_default(a, b, imm) \ __extension__({ \ float32x4_t ret; \ ret = vmovq_n_f32( \ vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))); \ ret = vsetq_lane_f32( \ vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \ ret, 1); \ ret = vsetq_lane_f32( \ vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \ ret, 2); \ ret = vsetq_lane_f32( \ vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \ ret, 3); \ vreinterpretq_m128_f32(ret); \ }) // Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified // by imm. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100) // FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a, // __constrange(0,255) int // imm) #define _mm_shufflelo_epi16_function(a, imm) \ __extension__({ \ int16x8_t ret = vreinterpretq_s16_m128i(a); \ int16x4_t lowBits = vget_low_s16(ret); \ ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \ ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \ 1); \ ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \ 2); \ ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \ 3); \ vreinterpretq_m128i_s16(ret); \ }) // Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified // by imm. // https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx // FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a, // __constrange(0,255) int // imm) #define _mm_shufflehi_epi16_function(a, imm) \ __extension__({ \ int16x8_t ret = vreinterpretq_s16_m128i(a); \ int16x4_t highBits = vget_high_s16(ret); \ ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \ ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \ 5); \ ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \ 6); \ ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \ 7); \ vreinterpretq_m128i_s16(ret); \ }) /* MMX */ //_mm_empty is a no-op on arm FORCE_INLINE void _mm_empty(void) {} /* SSE */ // Adds the four single-precision, floating-point values of a and b. // // r0 := a0 + b0 // r1 := a1 + b1 // r2 := a2 + b2 // r3 := a3 + b3 // // https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b) { return vreinterpretq_m128_f32( vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } // adds the scalar single-precision floating point values of a and b. // https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b) { float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0); // the upper values in the result must be the remnants of . return vreinterpretq_m128_f32(vaddq_f32(a, value)); } // Computes the bitwise AND of the four single-precision, floating-point values // of a and b. 
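/* Illustrative sketch (invented helper name): scalar model of the
 * _mm_shufflelo_epi16 behaviour implemented by the macro above -- the low
 * four 16-bit lanes are permuted according to imm8 while the high four
 * lanes pass through unchanged; _mm_shufflehi_epi16 mirrors this for the
 * upper half. */
FORCE_INLINE void _sse2neon_doc_shufflelo_epi16_model(const int16_t a[8],
                                                      int imm8,
                                                      int16_t dst[8])
{
    dst[0] = a[imm8 & 0x3];
    dst[1] = a[(imm8 >> 2) & 0x3];
    dst[2] = a[(imm8 >> 4) & 0x3];
    dst[3] = a[(imm8 >> 6) & 0x3];
    for (int i = 4; i < 8; i++)
        dst[i] = a[i]; /* upper half is copied verbatim */
}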
// // r0 := a0 & b0 // r1 := a1 & b1 // r2 := a2 & b2 // r3 := a3 & b3 // // https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b) { return vreinterpretq_m128_s32( vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); } // Computes the bitwise AND-NOT of the four single-precision, floating-point // values of a and b. // // r0 := ~a0 & b0 // r1 := ~a1 & b1 // r2 := ~a2 & b2 // r3 := ~a3 & b3 // // https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b) { return vreinterpretq_m128_s32( vbicq_s32(vreinterpretq_s32_m128(b), vreinterpretq_s32_m128(a))); // *NOTE* argument swap } // Average packed unsigned 16-bit integers in a and b, and store the results in // dst. // // FOR j := 0 to 3 // i := j*16 // dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16 FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b) { return vreinterpret_m64_u16( vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b))); } // Average packed unsigned 8-bit integers in a and b, and store the results in // dst. // // FOR j := 0 to 7 // i := j*8 // dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8 FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b) { return vreinterpret_m64_u8( vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); } // Compares for equality. // https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32( vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } // Compares for equality. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100) FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpeq_ps(a, b)); } // Compares for greater than or equal. // https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32( vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } // Compares for greater than or equal. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100) FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpge_ps(a, b)); } // Compares for greater than. // // r0 := (a0 > b0) ? 0xffffffff : 0x0 // r1 := (a1 > b1) ? 0xffffffff : 0x0 // r2 := (a2 > b2) ? 0xffffffff : 0x0 // r3 := (a3 > b3) ? 0xffffffff : 0x0 // // https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32( vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } // Compares for greater than. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100) FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpgt_ps(a, b)); } // Compares for less than or equal. // // r0 := (a0 <= b0) ? 0xffffffff : 0x0 // r1 := (a1 <= b1) ? 0xffffffff : 0x0 // r2 := (a2 <= b2) ? 0xffffffff : 0x0 // r3 := (a3 <= b3) ? 
0xffffffff : 0x0 // // https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32( vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } // Compares for less than or equal. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100) FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmple_ps(a, b)); } // Compares for less than // https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32( vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } // Compares for less than // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100) FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmplt_ps(a, b)); } // Compares for inequality. // https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32(vmvnq_u32( vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); } // Compares for inequality. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100) FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpneq_ps(a, b)); } // Compares for not greater than or equal. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100) FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32(vmvnq_u32( vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); } // Compares for not greater than or equal. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100) FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpnge_ps(a, b)); } // Compares for not greater than. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100) FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32(vmvnq_u32( vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); } // Compares for not greater than. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100) FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpngt_ps(a, b)); } // Compares for not less than or equal. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100) FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32(vmvnq_u32( vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); } // Compares for not less than or equal. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100) FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpnle_ps(a, b)); } // Compares for not less than. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100) FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32(vmvnq_u32( vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); } // Compares for not less than. 
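/* Illustrative sketch (invented helper name): scalar model of the packed
 * compares above; each 32-bit lane becomes an all-ones or all-zeros mask.
 * With a NaN operand the plain comparison is false, so e.g. _mm_cmplt_ps
 * yields 0 for that lane, while the negated forms (_mm_cmpnge_ps etc.),
 * built with vmvnq_u32 above, yield all ones -- matching x86's treatment
 * of unordered operands. */
FORCE_INLINE uint32_t _sse2neon_doc_cmplt_lane(float a, float b)
{
    return (a < b) ? 0xFFFFFFFFu : 0x0u;
}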
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100) FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpnlt_ps(a, b)); } // Compares the four 32-bit floats in a and b to check if any values are NaN. // Ordered compare between each value returns true for "orderable" and false for // "not orderable" (NaN). // https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see // also: // http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean // http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b) { // Note: NEON does not have ordered compare builtin // Need to compare a eq a and b eq b to check for NaN // Do AND of results to get final uint32x4_t ceqaa = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); uint32x4_t ceqbb = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb)); } // Compares for ordered. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100) FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpord_ps(a, b)); } // Compares for unordered. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100) FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b) { uint32x4_t f32a = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); uint32x4_t f32b = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b))); } // Compares for unordered. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100) FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpunord_ps(a, b)); } // Compares the lower single-precision floating point scalar values of a and b // using an equality operation. : // https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b) { uint32x4_t a_eq_b = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); return vgetq_lane_u32(a_eq_b, 0) & 0x1; } // Compares the lower single-precision floating point scalar values of a and b // using a greater than or equal operation. : // https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b) { uint32x4_t a_ge_b = vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); return vgetq_lane_u32(a_ge_b, 0) & 0x1; } // Compares the lower single-precision floating point scalar values of a and b // using a greater than operation. : // https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b) { uint32x4_t a_gt_b = vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); return vgetq_lane_u32(a_gt_b, 0) & 0x1; } // Compares the lower single-precision floating point scalar values of a and b // using a less than or equal operation. : // https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b) { uint32x4_t a_le_b = vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); return vgetq_lane_u32(a_le_b, 0) & 0x1; } // Compares the lower single-precision floating point scalar values of a and b // using a less than operation. 
: // https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx Important // note!! The documentation on MSDN is incorrect! If either of the values is a // NAN the docs say you will get a one, but in fact, it will return a zero!! FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b) { uint32x4_t a_lt_b = vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); return vgetq_lane_u32(a_lt_b, 0) & 0x1; } // Compares the lower single-precision floating point scalar values of a and b // using an inequality operation. : // https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b) { return !_mm_comieq_ss(a, b); } // Convert packed signed 32-bit integers in b to packed single-precision // (32-bit) floating-point elements, store the results in the lower 2 elements // of dst, and copy the upper 2 packed elements from a to the upper elements of // dst. // // dst[31:0] := Convert_Int32_To_FP32(b[31:0]) // dst[63:32] := Convert_Int32_To_FP32(b[63:32]) // dst[95:64] := a[95:64] // dst[127:96] := a[127:96] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b) { return vreinterpretq_m128_f32( vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), vget_high_f32(vreinterpretq_f32_m128(a)))); } // Convert packed single-precision (32-bit) floating-point elements in a to // packed 32-bit integers, and store the results in dst. // // FOR j := 0 to 1 // i := 32*j // dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a) { #if defined(__aarch64__) return vreinterpret_m64_s32( vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))))); #else return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32( vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION))))); #endif } // Convert the signed 32-bit integer b to a single-precision (32-bit) // floating-point element, store the result in the lower element of dst, and // copy the upper 3 packed elements from a to the upper elements of dst. // // dst[31:0] := Convert_Int32_To_FP32(b[31:0]) // dst[127:32] := a[127:32] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b) { return vreinterpretq_m128_f32( vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); } // Convert the lower single-precision (32-bit) floating-point element in a to a // 32-bit integer, and store the result in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si FORCE_INLINE int _mm_cvt_ss2si(__m128 a) { #if defined(__aarch64__) return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))), 0); #else float32_t data = vgetq_lane_f32( vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0); return (int32_t) data; #endif } // Convert packed 16-bit integers in a to packed single-precision (32-bit) // floating-point elements, and store the results in dst. 
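/* Illustrative sketch (invented helper name): what "ordered" means in
 * _mm_cmpord_ps/_mm_cmpunord_ps above.  A lane is ordered exactly when
 * neither operand is NaN, which is what the (a == a) && (b == b) trick
 * with two vceqq_f32 calls tests. */
FORCE_INLINE uint32_t _sse2neon_doc_cmpord_lane(float a, float b)
{
    return (a == a && b == b) ? 0xFFFFFFFFu : 0x0u;
}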
// // FOR j := 0 to 3 // i := j*16 // m := j*32 // dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a) { return vreinterpretq_m128_f32( vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a)))); } // Convert packed 32-bit integers in b to packed single-precision (32-bit) // floating-point elements, store the results in the lower 2 elements of dst, // and copy the upper 2 packed elements from a to the upper elements of dst. // // dst[31:0] := Convert_Int32_To_FP32(b[31:0]) // dst[63:32] := Convert_Int32_To_FP32(b[63:32]) // dst[95:64] := a[95:64] // dst[127:96] := a[127:96] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b) { return vreinterpretq_m128_f32( vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), vget_high_f32(vreinterpretq_f32_m128(a)))); } // Convert packed signed 32-bit integers in a to packed single-precision // (32-bit) floating-point elements, store the results in the lower 2 elements // of dst, then convert the packed signed 32-bit integers in b to // single-precision (32-bit) floating-point element, and store the results in // the upper 2 elements of dst. // // dst[31:0] := Convert_Int32_To_FP32(a[31:0]) // dst[63:32] := Convert_Int32_To_FP32(a[63:32]) // dst[95:64] := Convert_Int32_To_FP32(b[31:0]) // dst[127:96] := Convert_Int32_To_FP32(b[63:32]) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b) { return vreinterpretq_m128_f32(vcvtq_f32_s32( vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)))); } // Convert the lower packed 8-bit integers in a to packed single-precision // (32-bit) floating-point elements, and store the results in dst. // // FOR j := 0 to 3 // i := j*8 // m := j*32 // dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a) { return vreinterpretq_m128_f32(vcvtq_f32_s32( vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a)))))); } // Convert packed single-precision (32-bit) floating-point elements in a to // packed 16-bit integers, and store the results in dst. Note: this intrinsic // will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and // 0x7FFFFFFF. 
// // FOR j := 0 to 3 // i := 16*j // k := 32*j // IF a[k+31:k] >= FP32(0x7FFF) && a[k+31:k] <= FP32(0x7FFFFFFF) // dst[i+15:i] := 0x7FFF // ELSE // dst[i+15:i] := Convert_FP32_To_Int16(a[k+31:k]) // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi16 FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a) { const __m128 i16Min = _mm_set_ps1((float) INT16_MIN); const __m128 i16Max = _mm_set_ps1((float) INT16_MAX); const __m128 i32Max = _mm_set_ps1((float) INT32_MAX); const __m128i maxMask = _mm_castps_si128( _mm_and_ps(_mm_cmpge_ps(a, i16Max), _mm_cmple_ps(a, i32Max))); const __m128i betweenMask = _mm_castps_si128( _mm_and_ps(_mm_cmpgt_ps(a, i16Min), _mm_cmplt_ps(a, i16Max))); const __m128i minMask = _mm_cmpeq_epi32(_mm_or_si128(maxMask, betweenMask), _mm_setzero_si128()); __m128i max = _mm_and_si128(maxMask, _mm_set1_epi32(INT16_MAX)); __m128i min = _mm_and_si128(minMask, _mm_set1_epi32(INT16_MIN)); __m128i cvt = _mm_and_si128(betweenMask, _mm_cvtps_epi32(a)); __m128i res32 = _mm_or_si128(_mm_or_si128(max, min), cvt); return vreinterpret_m64_s16(vmovn_s32(vreinterpretq_s32_m128i(res32))); } // Convert packed single-precision (32-bit) floating-point elements in a to // packed 32-bit integers, and store the results in dst. // // FOR j := 0 to 1 // i := 32*j // dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi32 #define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a) // Convert packed single-precision (32-bit) floating-point elements in a to // packed 8-bit integers, and store the results in lower 4 elements of dst. // Note: this intrinsic will generate 0x7F, rather than 0x80, for input values // between 0x7F and 0x7FFFFFFF. // // FOR j := 0 to 3 // i := 8*j // k := 32*j // IF a[k+31:k] >= FP32(0x7F) && a[k+31:k] <= FP32(0x7FFFFFFF) // dst[i+7:i] := 0x7F // ELSE // dst[i+7:i] := Convert_FP32_To_Int8(a[k+31:k]) // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi8 FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a) { const __m128 i8Min = _mm_set_ps1((float) INT8_MIN); const __m128 i8Max = _mm_set_ps1((float) INT8_MAX); const __m128 i32Max = _mm_set_ps1((float) INT32_MAX); const __m128i maxMask = _mm_castps_si128( _mm_and_ps(_mm_cmpge_ps(a, i8Max), _mm_cmple_ps(a, i32Max))); const __m128i betweenMask = _mm_castps_si128( _mm_and_ps(_mm_cmpgt_ps(a, i8Min), _mm_cmplt_ps(a, i8Max))); const __m128i minMask = _mm_cmpeq_epi32(_mm_or_si128(maxMask, betweenMask), _mm_setzero_si128()); __m128i max = _mm_and_si128(maxMask, _mm_set1_epi32(INT8_MAX)); __m128i min = _mm_and_si128(minMask, _mm_set1_epi32(INT8_MIN)); __m128i cvt = _mm_and_si128(betweenMask, _mm_cvtps_epi32(a)); __m128i res32 = _mm_or_si128(_mm_or_si128(max, min), cvt); int16x4_t res16 = vmovn_s32(vreinterpretq_s32_m128i(res32)); int8x8_t res8 = vmovn_s16(vcombine_s16(res16, res16)); static const uint32_t bitMask[2] = {0xFFFFFFFF, 0}; int8x8_t mask = vreinterpret_s8_u32(vld1_u32(bitMask)); return vreinterpret_m64_s8(vorr_s8(vand_s8(mask, res8), vdup_n_s8(0))); } // Convert packed unsigned 16-bit integers in a to packed single-precision // (32-bit) floating-point elements, and store the results in dst. 
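/* Illustrative sketch (invented helper name, assumes <math.h> is available
 * for rintf): per-lane model of the saturating conversion performed by
 * _mm_cvtps_pi16 above.  Lanes in [INT16_MAX, INT32_MAX] clamp to 0x7FFF,
 * lanes strictly inside (INT16_MIN, INT16_MAX) are converted with the
 * current rounding mode (modelled here by rintf), and everything else --
 * including NaN -- falls through to INT16_MIN. */
FORCE_INLINE int16_t _sse2neon_doc_cvtps_pi16_lane(float x)
{
    if (x >= (float) INT16_MAX && x <= (float) INT32_MAX)
        return INT16_MAX;
    if (x > (float) INT16_MIN && x < (float) INT16_MAX)
        return (int16_t) rintf(x);
    return INT16_MIN;
}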
// // FOR j := 0 to 3 // i := j*16 // m := j*32 // dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a) { return vreinterpretq_m128_f32( vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a)))); } // Convert the lower packed unsigned 8-bit integers in a to packed // single-precision (32-bit) floating-point elements, and store the results in // dst. // // FOR j := 0 to 3 // i := j*8 // m := j*32 // dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a) { return vreinterpretq_m128_f32(vcvtq_f32_u32( vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a)))))); } // Convert the signed 32-bit integer b to a single-precision (32-bit) // floating-point element, store the result in the lower element of dst, and // copy the upper 3 packed elements from a to the upper elements of dst. // // dst[31:0] := Convert_Int32_To_FP32(b[31:0]) // dst[127:32] := a[127:32] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss #define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b) // Convert the signed 64-bit integer b to a single-precision (32-bit) // floating-point element, store the result in the lower element of dst, and // copy the upper 3 packed elements from a to the upper elements of dst. // // dst[31:0] := Convert_Int64_To_FP32(b[63:0]) // dst[127:32] := a[127:32] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b) { return vreinterpretq_m128_f32( vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); } // Copy the lower single-precision (32-bit) floating-point element of a to dst. // // dst[31:0] := a[31:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32 FORCE_INLINE float _mm_cvtss_f32(__m128 a) { return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); } // Convert the lower single-precision (32-bit) floating-point element in a to a // 32-bit integer, and store the result in dst. // // dst[31:0] := Convert_FP32_To_Int32(a[31:0]) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32 #define _mm_cvtss_si32(a) _mm_cvt_ss2si(a) // Convert the lower single-precision (32-bit) floating-point element in a to a // 64-bit integer, and store the result in dst. // // dst[63:0] := Convert_FP32_To_Int64(a[31:0]) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64 FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a) { #if defined(__aarch64__) return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0); #else float32_t data = vgetq_lane_f32( vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0); return (int64_t) data; #endif } // Convert packed single-precision (32-bit) floating-point elements in a to // packed 32-bit integers with truncation, and store the results in dst. 
// // FOR j := 0 to 1 // i := 32*j // dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a) { return vreinterpret_m64_s32( vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)))); } // Convert the lower single-precision (32-bit) floating-point element in a to a // 32-bit integer with truncation, and store the result in dst. // // dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si FORCE_INLINE int _mm_cvtt_ss2si(__m128 a) { return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0); } // Convert packed single-precision (32-bit) floating-point elements in a to // packed 32-bit integers with truncation, and store the results in dst. // // FOR j := 0 to 1 // i := 32*j // dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32 #define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a) // Convert the lower single-precision (32-bit) floating-point element in a to a // 32-bit integer with truncation, and store the result in dst. // // dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32 #define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a) // Convert the lower single-precision (32-bit) floating-point element in a to a // 64-bit integer with truncation, and store the result in dst. // // dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0]) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64 FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a) { return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); } // Divides the four single-precision, floating-point values of a and b. // // r0 := a0 / b0 // r1 := a1 / b1 // r2 := a2 / b2 // r3 := a3 / b3 // // https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b) { #if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV return vreinterpretq_m128_f32( vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); #else float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b)); recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b))); #if SSE2NEON_PRECISE_DIV // Additional Netwon-Raphson iteration for accuracy recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b))); #endif return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip)); #endif } // Divides the scalar single-precision floating point value of a by b. // https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b) { float32_t value = vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0); return vreinterpretq_m128_f32( vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); } // Extract a 16-bit integer from a, selected with imm8, and store the result in // the lower element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_pi16 #define _mm_extract_pi16(a, imm) \ (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm)) // Free aligned memory that was allocated with _mm_malloc. 
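/* Illustrative sketch (invented helper name): the Newton-Raphson step that
 * _mm_div_ps above uses on targets without vdivq_f32; the same refinement
 * appears again in _mm_rcp_ps and _mm_rsqrt_ps further below.  vrecpeq_f32
 * returns a rough estimate x0 of 1/b, and each vrecpsq_f32/vmulq_f32 pair
 * applies x <- x * (2 - b * x), roughly doubling the number of correct
 * bits per iteration. */
FORCE_INLINE float _sse2neon_doc_recip_nr_step(float x, float b)
{
    return x * (2.0f - b * x);
}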
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_free FORCE_INLINE void _mm_free(void *addr) { FREE(addr); } // Macro: Get the flush zero bits from the MXCSR control and status register. // The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or // _MM_FLUSH_ZERO_OFF // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_FLUSH_ZERO_MODE FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode() { union { fpcr_bitfield field; #if defined(__aarch64__) uint64_t value; #else uint32_t value; #endif } r; #if defined(__aarch64__) __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */ #else __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ #endif return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF; } // Macro: Get the rounding mode bits from the MXCSR control and status register. // The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, // _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_ROUNDING_MODE FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE() { union { fpcr_bitfield field; #if defined(__aarch64__) uint64_t value; #else uint32_t value; #endif } r; #if defined(__aarch64__) __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */ #else __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ #endif if (r.field.bit22) { return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP; } else { return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST; } } // Copy a to dst, and insert the 16-bit integer i into dst at the location // specified by imm8. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_pi16 #define _mm_insert_pi16(a, b, imm) \ __extension__({ \ vreinterpret_m64_s16( \ vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \ }) // Loads four single-precision, floating-point values. // https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx FORCE_INLINE __m128 _mm_load_ps(const float *p) { return vreinterpretq_m128_f32(vld1q_f32(p)); } // Load a single-precision (32-bit) floating-point element from memory into all // elements of dst. // // dst[31:0] := MEM[mem_addr+31:mem_addr] // dst[63:32] := MEM[mem_addr+31:mem_addr] // dst[95:64] := MEM[mem_addr+31:mem_addr] // dst[127:96] := MEM[mem_addr+31:mem_addr] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1 #define _mm_load_ps1 _mm_load1_ps // Loads an single - precision, floating - point value into the low word and // clears the upper three words. // https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx FORCE_INLINE __m128 _mm_load_ss(const float *p) { return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0)); } // Loads a single single-precision, floating-point value, copying it into all // four words // https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx FORCE_INLINE __m128 _mm_load1_ps(const float *p) { return vreinterpretq_m128_f32(vld1q_dup_f32(p)); } // Sets the upper two single-precision, floating-point values with 64 // bits of data loaded from the address p; the lower two values are passed // through from a. 
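/* Illustrative sketch (invented helper name): the FPCR.RMode encoding that
 * _MM_GET_ROUNDING_MODE above decodes, with bit23 as the high bit and
 * bit22 as the low bit of the two-bit field:
 *   00 -> _MM_ROUND_NEAREST      01 -> _MM_ROUND_UP
 *   10 -> _MM_ROUND_DOWN         11 -> _MM_ROUND_TOWARD_ZERO */
FORCE_INLINE unsigned int _sse2neon_doc_decode_rmode(unsigned int bit23,
                                                     unsigned int bit22)
{
    if (bit22)
        return bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;
    return bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;
}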
// // r0 := a0 // r1 := a1 // r2 := *p0 // r3 := *p1 // // https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p) { return vreinterpretq_m128_f32( vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p))); } // Sets the lower two single-precision, floating-point values with 64 // bits of data loaded from the address p; the upper two values are passed // through from a. // // Return Value // r0 := *p0 // r1 := *p1 // r2 := a2 // r3 := a3 // // https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p) { return vreinterpretq_m128_f32( vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a))); } // Load 4 single-precision (32-bit) floating-point elements from memory into dst // in reverse order. mem_addr must be aligned on a 16-byte boundary or a // general-protection exception may be generated. // // dst[31:0] := MEM[mem_addr+127:mem_addr+96] // dst[63:32] := MEM[mem_addr+95:mem_addr+64] // dst[95:64] := MEM[mem_addr+63:mem_addr+32] // dst[127:96] := MEM[mem_addr+31:mem_addr] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps FORCE_INLINE __m128 _mm_loadr_ps(const float *p) { float32x4_t v = vrev64q_f32(vld1q_f32(p)); return vreinterpretq_m128_f32(vextq_f32(v, v, 2)); } // Loads four single-precision, floating-point values. // https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx FORCE_INLINE __m128 _mm_loadu_ps(const float *p) { // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are // equivalent for neon return vreinterpretq_m128_f32(vld1q_f32(p)); } // Load unaligned 16-bit integer from memory into the first element of dst. // // dst[15:0] := MEM[mem_addr+15:mem_addr] // dst[MAX:16] := 0 // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16 FORCE_INLINE __m128i _mm_loadu_si16(const void *p) { return vreinterpretq_m128i_s16( vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0)); } // Load unaligned 64-bit integer from memory into the first element of dst. // // dst[63:0] := MEM[mem_addr+63:mem_addr] // dst[MAX:64] := 0 // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64 FORCE_INLINE __m128i _mm_loadu_si64(const void *p) { return vreinterpretq_m128i_s64( vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0))); } // Allocate aligned blocks of memory. // https://software.intel.com/en-us/ // cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks FORCE_INLINE void *_mm_malloc(size_t size, size_t align) { void *ptr; if (align == 1) return MALLOC(size); if (align == 2 || (sizeof(void *) == 8 && align == 4)) align = sizeof(void *); if (!posix_memalign(&ptr, align, size)) return ptr; return NULL; } // Conditionally store 8-bit integer elements from a into memory using mask // (elements are not stored when the highest bit is not set in the corresponding // element) and a non-temporal memory hint. 
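/* Illustrative sketch (invented helper name): typical pairing of the
 * _mm_malloc/_mm_free wrappers defined above -- request 16-byte alignment
 * for data that will be loaded with _mm_load_ps, and release it again with
 * _mm_free. */
FORCE_INLINE float *_sse2neon_doc_alloc_aligned_vec4(void)
{
    /* caller must release the buffer with _mm_free() */
    return (float *) _mm_malloc(4 * sizeof(float), 16);
}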
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmove_si64 FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr) { int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7); __m128 b = _mm_load_ps((const float *) mem_addr); int8x8_t masked = vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a), vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b)))); vst1_s8((int8_t *) mem_addr, masked); } // Conditionally store 8-bit integer elements from a into memory using mask // (elements are not stored when the highest bit is not set in the corresponding // element) and a non-temporal memory hint. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_maskmovq #define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr) // Compare packed signed 16-bit integers in a and b, and store packed maximum // values in dst. // // FOR j := 0 to 3 // i := j*16 // dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16 FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b) { return vreinterpret_m64_s16( vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); } // Computes the maximums of the four single-precision, floating-point values of // a and b. // https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b) { #if SSE2NEON_PRECISE_MINMAX float32x4_t _a = vreinterpretq_f32_m128(a); float32x4_t _b = vreinterpretq_f32_m128(b); return vreinterpretq_m128_f32(vbslq_f32(vcgtq_f32(_a, _b), _a, _b)); #else return vreinterpretq_m128_f32( vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); #endif } // Compare packed unsigned 8-bit integers in a and b, and store packed maximum // values in dst. // // FOR j := 0 to 7 // i := j*8 // dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8 FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b) { return vreinterpret_m64_u8( vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); } // Computes the maximum of the two lower scalar single-precision floating point // values of a and b. // https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b) { float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0); return vreinterpretq_m128_f32( vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); } // Compare packed signed 16-bit integers in a and b, and store packed minimum // values in dst. // // FOR j := 0 to 3 // i := j*16 // dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16 FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b) { return vreinterpret_m64_s16( vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); } // Computes the minima of the four single-precision, floating-point values of a // and b. // https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b) { #if SSE2NEON_PRECISE_MINMAX float32x4_t _a = vreinterpretq_f32_m128(a); float32x4_t _b = vreinterpretq_f32_m128(b); return vreinterpretq_m128_f32(vbslq_f32(vcltq_f32(_a, _b), _a, _b)); #else return vreinterpretq_m128_f32( vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); #endif } // Compare packed unsigned 8-bit integers in a and b, and store packed minimum // values in dst. 
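/* Illustrative sketch (invented helper name): the scalar rule the
 * SSE2NEON_PRECISE_MINMAX branches above reproduce.  x86 maxps/minps
 * return the second operand whenever the comparison fails, so a NaN in
 * either input selects b; plain vmaxq_f32/vminq_f32 handle NaN (and signed
 * zero) differently, hence the vcgtq/vcltq + vbslq variant. */
FORCE_INLINE float _sse2neon_doc_x86_max_lane(float a, float b)
{
    return (a > b) ? a : b; /* NaN in a or b makes the test false -> b */
}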
// // FOR j := 0 to 7 // i := j*8 // dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8 FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b) { return vreinterpret_m64_u8( vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); } // Computes the minimum of the two lower scalar single-precision floating point // values of a and b. // https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b) { float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0); return vreinterpretq_m128_f32( vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); } // Sets the low word to the single-precision, floating-point value of b // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100) FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b) { return vreinterpretq_m128_f32( vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0), vreinterpretq_f32_m128(a), 0)); } // Moves the upper two values of B into the lower two values of A. // // r3 := a3 // r2 := a2 // r1 := b3 // r0 := b2 FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B) { float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A)); float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B)); return vreinterpretq_m128_f32(vcombine_f32(b32, a32)); } // Moves the lower two values of B into the upper two values of A. // // r3 := b1 // r2 := b0 // r1 := a1 // r0 := a0 FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B) { float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A)); float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B)); return vreinterpretq_m128_f32(vcombine_f32(a10, b10)); } // Create mask from the most significant bit of each 8-bit element in a, and // store the result in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pi8 FORCE_INLINE int _mm_movemask_pi8(__m64 a) { uint8x8_t input = vreinterpret_u8_m64(a); #if defined(__aarch64__) static const int8x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7}; uint8x8_t tmp = vshr_n_u8(input, 7); return vaddv_u8(vshl_u8(tmp, shift)); #else // Refer the implementation of `_mm_movemask_epi8` uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7)); uint32x2_t paired16 = vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7)); uint8x8_t paired32 = vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14)); return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4); #endif } // NEON does not provide this method // Creates a 4-bit mask from the most significant bits of the four // single-precision, floating-point values. // https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx FORCE_INLINE int _mm_movemask_ps(__m128 a) { uint32x4_t input = vreinterpretq_u32_m128(a); #if defined(__aarch64__) static const int32x4_t shift = {0, 1, 2, 3}; uint32x4_t tmp = vshrq_n_u32(input, 31); return vaddvq_u32(vshlq_u32(tmp, shift)); #else // Uses the exact same method as _mm_movemask_epi8, see that for details. // Shift out everything but the sign bits with a 32-bit unsigned shift // right. uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31)); // Merge the two pairs together with a 64-bit unsigned shift right + add. uint8x16_t paired = vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31)); // Extract the result. 
return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2); #endif } // Multiplies the four single-precision, floating-point values of a and b. // // r0 := a0 * b0 // r1 := a1 * b1 // r2 := a2 * b2 // r3 := a3 * b3 // // https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b) { return vreinterpretq_m128_f32( vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } // Multiply the lower single-precision (32-bit) floating-point element in a and // b, store the result in the lower element of dst, and copy the upper 3 packed // elements from a to the upper elements of dst. // // dst[31:0] := a[31:0] * b[31:0] // dst[127:32] := a[127:32] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_mul_ps(a, b)); } // Multiply the packed unsigned 16-bit integers in a and b, producing // intermediate 32-bit integers, and store the high 16 bits of the intermediate // integers in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16 FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b) { return vreinterpret_m64_u16(vshrn_n_u32( vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16)); } // Computes the bitwise OR of the four single-precision, floating-point values // of a and b. // https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) { return vreinterpretq_m128_s32( vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); } // Average packed unsigned 8-bit integers in a and b, and store the results in // dst. // // FOR j := 0 to 7 // i := j*8 // dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb #define _m_pavgb(a, b) _mm_avg_pu8(a, b) // Average packed unsigned 16-bit integers in a and b, and store the results in // dst. // // FOR j := 0 to 3 // i := j*16 // dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw #define _m_pavgw(a, b) _mm_avg_pu16(a, b) // Extract a 16-bit integer from a, selected with imm8, and store the result in // the lower element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pextrw #define _m_pextrw(a, imm) _mm_extract_pi16(a, imm) // Copy a to dst, and insert the 16-bit integer i into dst at the location // specified by imm8. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_pinsrw #define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm) // Compare packed signed 16-bit integers in a and b, and store packed maximum // values in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxsw #define _m_pmaxsw(a, b) _mm_max_pi16(a, b) // Compare packed unsigned 8-bit integers in a and b, and store packed maximum // values in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxub #define _m_pmaxub(a, b) _mm_max_pu8(a, b) // Compare packed signed 16-bit integers in a and b, and store packed minimum // values in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminsw #define _m_pminsw(a, b) _mm_min_pi16(a, b) // Compare packed unsigned 8-bit integers in a and b, and store packed minimum // values in dst. 
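/* Illustrative sketch (invented helper name): scalar model of
 * _mm_movemask_ps above -- bit j of the result is the sign bit of lane j,
 * which is what the shift/accumulate trick with vshrq_n_u32 and
 * vsraq_n_u64 (or vaddvq_u32 on AArch64) collects. */
FORCE_INLINE int _sse2neon_doc_movemask_ps_model(const uint32_t lanes[4])
{
    int mask = 0;
    for (int j = 0; j < 4; j++)
        mask |= (int) (lanes[j] >> 31) << j;
    return mask;
}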
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminub #define _m_pminub(a, b) _mm_min_pu8(a, b) // Create mask from the most significant bit of each 8-bit element in a, and // store the result in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmovmskb #define _m_pmovmskb(a) _mm_movemask_pi8(a) // Multiply the packed unsigned 16-bit integers in a and b, producing // intermediate 32-bit integers, and store the high 16 bits of the intermediate // integers in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw #define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b) // Loads one cache line of data from address p to a location closer to the // processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx FORCE_INLINE void _mm_prefetch(const void *p, int i) { (void) i; __builtin_prefetch(p); } // Compute the absolute differences of packed unsigned 8-bit integers in a and // b, then horizontally sum each consecutive 8 differences to produce four // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low // 16 bits of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_psadbw #define _m_psadbw(a, b) _mm_sad_pu8(a, b) // Shuffle 16-bit integers in a using the control in imm8, and store the results // in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pshufw #define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm) // Compute the approximate reciprocal of packed single-precision (32-bit) // floating-point elements in a, and store the results in dst. The maximum // relative error for this approximation is less than 1.5*2^-12. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps FORCE_INLINE __m128 _mm_rcp_ps(__m128 in) { float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in)); recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in))); #if SSE2NEON_PRECISE_DIV // Additional Netwon-Raphson iteration for accuracy recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in))); #endif return vreinterpretq_m128_f32(recip); } // Compute the approximate reciprocal of the lower single-precision (32-bit) // floating-point element in a, store the result in the lower element of dst, // and copy the upper 3 packed elements from a to the upper elements of dst. The // maximum relative error for this approximation is less than 1.5*2^-12. // // dst[31:0] := (1.0 / a[31:0]) // dst[127:32] := a[127:32] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss FORCE_INLINE __m128 _mm_rcp_ss(__m128 a) { return _mm_move_ss(a, _mm_rcp_ps(a)); } // Computes the approximations of the reciprocal square roots of the four // single-precision floating point values of in. // The current precision is 1% error. 
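/* Illustrative sketch (invented helper name): the Newton-Raphson step used
 * by _mm_rsqrt_ps just below when SSE2NEON_PRECISE_SQRT is enabled.
 * vrsqrteq_f32 gives a rough estimate x0 of 1/sqrt(in), and each
 * vrsqrtsq_f32/vmulq_f32 pair applies x <- x * (3 - in * x * x) / 2. */
FORCE_INLINE float _sse2neon_doc_rsqrt_nr_step(float x, float in)
{
    return x * (3.0f - in * x * x) / 2.0f;
}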
// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in) { float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in)); #if SSE2NEON_PRECISE_SQRT // Additional Netwon-Raphson iteration for accuracy out = vmulq_f32( out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out)); out = vmulq_f32( out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out)); #endif return vreinterpretq_m128_f32(out); } // Compute the approximate reciprocal square root of the lower single-precision // (32-bit) floating-point element in a, store the result in the lower element // of dst, and copy the upper 3 packed elements from a to the upper elements of // dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in) { return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0); } // Compute the absolute differences of packed unsigned 8-bit integers in a and // b, then horizontally sum each consecutive 8 differences to produce four // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low // 16 bits of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8 FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b) { uint64x1_t t = vpaddl_u32(vpaddl_u16( vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))))); return vreinterpret_m64_u16( vset_lane_u16(vget_lane_u64(t, 0), vdup_n_u16(0), 0)); } // Macro: Set the flush zero bits of the MXCSR control and status register to // the value in unsigned 32-bit integer a. The flush zero may contain any of the // following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_FLUSH_ZERO_MODE FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag) { // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting, // regardless of the value of the FZ bit. union { fpcr_bitfield field; #if defined(__aarch64__) uint64_t value; #else uint32_t value; #endif } r; #if defined(__aarch64__) __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */ #else __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ #endif r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON; #if defined(__aarch64__) __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */ #else __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ #endif } // Sets the four single-precision, floating-point values to the four inputs. // https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x) { float ALIGN_STRUCT(16) data[4] = {x, y, z, w}; return vreinterpretq_m128_f32(vld1q_f32(data)); } // Sets the four single-precision, floating-point values to w. // https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx FORCE_INLINE __m128 _mm_set_ps1(float _w) { return vreinterpretq_m128_f32(vdupq_n_f32(_w)); } // Macro: Set the rounding mode bits of the MXCSR control and status register to // the value in unsigned 32-bit integer a. 
The rounding mode may contain any of // the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, // _MM_ROUND_TOWARD_ZERO // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding) { union { fpcr_bitfield field; #if defined(__aarch64__) uint64_t value; #else uint32_t value; #endif } r; #if defined(__aarch64__) __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */ #else __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ #endif switch (rounding) { case _MM_ROUND_TOWARD_ZERO: r.field.bit22 = 1; r.field.bit23 = 1; break; case _MM_ROUND_DOWN: r.field.bit22 = 0; r.field.bit23 = 1; break; case _MM_ROUND_UP: r.field.bit22 = 1; r.field.bit23 = 0; break; default: //_MM_ROUND_NEAREST r.field.bit22 = 0; r.field.bit23 = 0; } #if defined(__aarch64__) __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */ #else __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ #endif } // Copy single-precision (32-bit) floating-point element a to the lower element // of dst, and zero the upper 3 elements. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss FORCE_INLINE __m128 _mm_set_ss(float a) { float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0}; return vreinterpretq_m128_f32(vld1q_f32(data)); } // Sets the four single-precision, floating-point values to w. // // r0 := r1 := r2 := r3 := w // // https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx FORCE_INLINE __m128 _mm_set1_ps(float _w) { return vreinterpretq_m128_f32(vdupq_n_f32(_w)); } // FIXME: _mm_setcsr() implementation supports changing the rounding mode only. FORCE_INLINE void _mm_setcsr(unsigned int a) { _MM_SET_ROUNDING_MODE(a); } // FIXME: _mm_getcsr() implementation supports reading the rounding mode only. FORCE_INLINE unsigned int _mm_getcsr() { return _MM_GET_ROUNDING_MODE(); } // Sets the four single-precision, floating-point values to the four inputs in // reverse order. // https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x) { float ALIGN_STRUCT(16) data[4] = {w, z, y, x}; return vreinterpretq_m128_f32(vld1q_f32(data)); } // Clears the four single-precision, floating-point values. // https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx FORCE_INLINE __m128 _mm_setzero_ps(void) { return vreinterpretq_m128_f32(vdupq_n_f32(0)); } // Shuffle 16-bit integers in a using the control in imm8, and store the results // in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pi16 #if __has_builtin(__builtin_shufflevector) #define _mm_shuffle_pi16(a, imm) \ __extension__({ \ vreinterpret_m64_s16(__builtin_shufflevector( \ vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \ ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3))); \ }) #else #define _mm_shuffle_pi16(a, imm) \ __extension__({ \ int16x4_t ret; \ ret = \ vmov_n_s16(vget_lane_s16(vreinterpret_s16_m64(a), (imm) & (0x3))); \ ret = vset_lane_s16( \ vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 2) & 0x3), ret, \ 1); \ ret = vset_lane_s16( \ vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 4) & 0x3), ret, \ 2); \ ret = vset_lane_s16( \ vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 6) & 0x3), ret, \ 3); \ vreinterpret_m64_s16(ret); \ }) #endif // Guarantees that every preceding store is globally visible before any // subsequent store. 
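// Note (added for clarity): __sync_synchronize() emits a full memory barrier,
// which is stronger than the store-store ordering guaranteed by x86 SFENCE;
// the required semantics are preserved at the cost of also ordering loads.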
// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx FORCE_INLINE void _mm_sfence(void) { __sync_synchronize(); } // FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255) // int imm) #if __has_builtin(__builtin_shufflevector) #define _mm_shuffle_ps(a, b, imm) \ __extension__({ \ float32x4_t _input1 = vreinterpretq_f32_m128(a); \ float32x4_t _input2 = vreinterpretq_f32_m128(b); \ float32x4_t _shuf = __builtin_shufflevector( \ _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \ (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \ vreinterpretq_m128_f32(_shuf); \ }) #else // generic #define _mm_shuffle_ps(a, b, imm) \ __extension__({ \ __m128 ret; \ switch (imm) { \ case _MM_SHUFFLE(1, 0, 3, 2): \ ret = _mm_shuffle_ps_1032((a), (b)); \ break; \ case _MM_SHUFFLE(2, 3, 0, 1): \ ret = _mm_shuffle_ps_2301((a), (b)); \ break; \ case _MM_SHUFFLE(0, 3, 2, 1): \ ret = _mm_shuffle_ps_0321((a), (b)); \ break; \ case _MM_SHUFFLE(2, 1, 0, 3): \ ret = _mm_shuffle_ps_2103((a), (b)); \ break; \ case _MM_SHUFFLE(1, 0, 1, 0): \ ret = _mm_movelh_ps((a), (b)); \ break; \ case _MM_SHUFFLE(1, 0, 0, 1): \ ret = _mm_shuffle_ps_1001((a), (b)); \ break; \ case _MM_SHUFFLE(0, 1, 0, 1): \ ret = _mm_shuffle_ps_0101((a), (b)); \ break; \ case _MM_SHUFFLE(3, 2, 1, 0): \ ret = _mm_shuffle_ps_3210((a), (b)); \ break; \ case _MM_SHUFFLE(0, 0, 1, 1): \ ret = _mm_shuffle_ps_0011((a), (b)); \ break; \ case _MM_SHUFFLE(0, 0, 2, 2): \ ret = _mm_shuffle_ps_0022((a), (b)); \ break; \ case _MM_SHUFFLE(2, 2, 0, 0): \ ret = _mm_shuffle_ps_2200((a), (b)); \ break; \ case _MM_SHUFFLE(3, 2, 0, 2): \ ret = _mm_shuffle_ps_3202((a), (b)); \ break; \ case _MM_SHUFFLE(3, 2, 3, 2): \ ret = _mm_movehl_ps((b), (a)); \ break; \ case _MM_SHUFFLE(1, 1, 3, 3): \ ret = _mm_shuffle_ps_1133((a), (b)); \ break; \ case _MM_SHUFFLE(2, 0, 1, 0): \ ret = _mm_shuffle_ps_2010((a), (b)); \ break; \ case _MM_SHUFFLE(2, 0, 0, 1): \ ret = _mm_shuffle_ps_2001((a), (b)); \ break; \ case _MM_SHUFFLE(2, 0, 3, 2): \ ret = _mm_shuffle_ps_2032((a), (b)); \ break; \ default: \ ret = _mm_shuffle_ps_default((a), (b), (imm)); \ break; \ } \ ret; \ }) #endif // Computes the approximations of square roots of the four single-precision, // floating-point values of a. First computes reciprocal square roots and then // reciprocals of the four values. // // r0 := SQRT(a0) // r1 := SQRT(a1) // r2 := SQRT(a2) // r3 := SQRT(a3) // // https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in) { #if SSE2NEON_PRECISE_SQRT float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in)); // Test for vrsqrteq_f32(0) -> positive infinity case. // Change to zero, so that s * 1/SQRT(s) result is zero too. 
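    // (0x7F800000 is the IEEE-754 single-precision bit pattern of +infinity.
    // Lanes whose reciprocal-sqrt estimate equals it came from a zero input
    // and are masked to zero below, so the final s * 1/SQRT(s) product is 0
    // rather than 0 * infinity = NaN.)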
const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000); const uint32x4_t div_by_zero = vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip)); recip = vreinterpretq_f32_u32( vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip))); // Additional Netwon-Raphson iteration for accuracy recip = vmulq_f32( vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)), recip); recip = vmulq_f32( vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)), recip); // SQRT(s) = s * 1/SQRT(s) return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip)); #elif defined(__aarch64__) return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in))); #else float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in)); float32x4_t sq = vrecpeq_f32(recipsq); return vreinterpretq_m128_f32(sq); #endif } // Computes the approximation of the square root of the scalar single-precision // floating point value of in. // https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in) { float32_t value = vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0); return vreinterpretq_m128_f32( vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0)); } // Stores four single-precision, floating-point values. // https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx FORCE_INLINE void _mm_store_ps(float *p, __m128 a) { vst1q_f32(p, vreinterpretq_f32_m128(a)); } // Store the lower single-precision (32-bit) floating-point element from a into // 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte // boundary or a general-protection exception may be generated. // // MEM[mem_addr+31:mem_addr] := a[31:0] // MEM[mem_addr+63:mem_addr+32] := a[31:0] // MEM[mem_addr+95:mem_addr+64] := a[31:0] // MEM[mem_addr+127:mem_addr+96] := a[31:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1 FORCE_INLINE void _mm_store_ps1(float *p, __m128 a) { float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); vst1q_f32(p, vdupq_n_f32(a0)); } // Stores the lower single - precision, floating - point value. // https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx FORCE_INLINE void _mm_store_ss(float *p, __m128 a) { vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0); } // Store the lower single-precision (32-bit) floating-point element from a into // 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte // boundary or a general-protection exception may be generated. // // MEM[mem_addr+31:mem_addr] := a[31:0] // MEM[mem_addr+63:mem_addr+32] := a[31:0] // MEM[mem_addr+95:mem_addr+64] := a[31:0] // MEM[mem_addr+127:mem_addr+96] := a[31:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps #define _mm_store1_ps _mm_store_ps1 // Stores the upper two single-precision, floating-point values of a to the // address p. // // *p0 := a2 // *p1 := a3 // // https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a) { *p = vreinterpret_m64_f32(vget_high_f32(a)); } // Stores the lower two single-precision floating point values of a to the // address p. // // *p0 := a0 // *p1 := a1 // // https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a) { *p = vreinterpret_m64_f32(vget_low_f32(a)); } // Store 4 single-precision (32-bit) floating-point elements from a into memory // in reverse order. 
mem_addr must be aligned on a 16-byte boundary or a // general-protection exception may be generated. // // MEM[mem_addr+31:mem_addr] := a[127:96] // MEM[mem_addr+63:mem_addr+32] := a[95:64] // MEM[mem_addr+95:mem_addr+64] := a[63:32] // MEM[mem_addr+127:mem_addr+96] := a[31:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps FORCE_INLINE void _mm_storer_ps(float *p, __m128 a) { float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a)); float32x4_t rev = vextq_f32(tmp, tmp, 2); vst1q_f32(p, rev); } // Stores four single-precision, floating-point values. // https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a) { vst1q_f32(p, vreinterpretq_f32_m128(a)); } // Stores 16-bits of integer data a at the address p. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si16 FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a) { vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0); } // Stores 64-bits of integer data a at the address p. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si64 FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a) { vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0); } // Store 64-bits of integer data from a into memory using a non-temporal memory // hint. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pi FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a) { vst1_s64((int64_t *) p, vreinterpret_s64_m64(a)); } // Store 128-bits (composed of 4 packed single-precision (32-bit) floating- // point elements) from a into memory using a non-temporal memory hint. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps FORCE_INLINE void _mm_stream_ps(float *p, __m128 a) { #if __has_builtin(__builtin_nontemporal_store) __builtin_nontemporal_store(a, (float32x4_t *) p); #else vst1q_f32(p, vreinterpretq_f32_m128(a)); #endif } // Subtracts the four single-precision, floating-point values of a and b. // // r0 := a0 - b0 // r1 := a1 - b1 // r2 := a2 - b2 // r3 := a3 - b3 // // https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b) { return vreinterpretq_m128_f32( vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } // Subtract the lower single-precision (32-bit) floating-point element in b from // the lower single-precision (32-bit) floating-point element in a, store the // result in the lower element of dst, and copy the upper 3 packed elements from // a to the upper elements of dst. // // dst[31:0] := a[31:0] - b[31:0] // dst[127:32] := a[127:32] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_sub_ps(a, b)); } // Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision // (32-bit) floating-point elements in row0, row1, row2, and row3, and store the // transposed matrix in these vectors (row0 now contains column 0, etc.). 
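// Implementation note (added for clarity): the macro below performs two 2x2
// transposes with vtrnq_f32 and recombines the low/high halves with
// vcombine_f32. Illustrative use on a row-major 4x4 matrix m, assuming the
// _mm_loadu_ps/_mm_storeu_ps wrappers defined elsewhere in this header:
//
//     __m128 r0 = _mm_loadu_ps(m + 0), r1 = _mm_loadu_ps(m + 4);
//     __m128 r2 = _mm_loadu_ps(m + 8), r3 = _mm_loadu_ps(m + 12);
//     _MM_TRANSPOSE4_PS(r0, r1, r2, r3);  // r0..r3 now hold the columns
//     _mm_storeu_ps(m + 0, r0);  _mm_storeu_ps(m + 4, r1);
//     _mm_storeu_ps(m + 8, r2);  _mm_storeu_ps(m + 12, r3);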
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ do { \ float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \ float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \ row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \ vget_low_f32(ROW23.val[0])); \ row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \ vget_low_f32(ROW23.val[1])); \ row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \ vget_high_f32(ROW23.val[0])); \ row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \ vget_high_f32(ROW23.val[1])); \ } while (0) // according to the documentation, these intrinsics behave the same as the // non-'u' versions. We'll just alias them here. #define _mm_ucomieq_ss _mm_comieq_ss #define _mm_ucomige_ss _mm_comige_ss #define _mm_ucomigt_ss _mm_comigt_ss #define _mm_ucomile_ss _mm_comile_ss #define _mm_ucomilt_ss _mm_comilt_ss #define _mm_ucomineq_ss _mm_comineq_ss // Return vector of type __m128i with undefined elements. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_undefined_si128 FORCE_INLINE __m128i _mm_undefined_si128(void) { #if defined(__GNUC__) || defined(__clang__) //#pragma GCC diagnostic push //#pragma GCC diagnostic ignored "-Wuninitialized" #endif // __m128i a; __m128i a = ZERO_SSE2NEON(); BUG_SSE2NEON; return a; #if defined(__GNUC__) || defined(__clang__) //#pragma GCC diagnostic pop #endif } // Return vector of type __m128 with undefined elements. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps FORCE_INLINE __m128 _mm_undefined_ps(void) { #if defined(__GNUC__) || defined(__clang__) //#pragma GCC diagnostic push //#pragma GCC diagnostic ignored "-Wuninitialized" #endif // __m128 a; __m128 a = (__m128) ZERO_SSE2NEON(); BUG_SSE2NEON; return a; #if defined(__GNUC__) || defined(__clang__) //#pragma GCC diagnostic pop #endif } // Selects and interleaves the upper two single-precision, floating-point values // from a and b. // // r0 := a2 // r1 := b2 // r2 := a3 // r3 := b3 // // https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b) { #if defined(__aarch64__) return vreinterpretq_m128_f32( vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); #else float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a)); float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b)); float32x2x2_t result = vzip_f32(a1, b1); return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); #endif } // Selects and interleaves the lower two single-precision, floating-point values // from a and b. // // r0 := a0 // r1 := b0 // r2 := a1 // r3 := b1 // // https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b) { #if defined(__aarch64__) return vreinterpretq_m128_f32( vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); #else float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a)); float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b)); float32x2x2_t result = vzip_f32(a1, b1); return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); #endif } // Computes bitwise EXOR (exclusive-or) of the four single-precision, // floating-point values of a and b. 
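// Implementation note (added for clarity): NEON has no bitwise operations on
// float vectors, so the implementation below reinterprets the lanes as 32-bit
// integers, applies veorq_s32, and reinterprets back; since XOR is purely
// bitwise the result matches SSE bit for bit.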
// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_s32(
        veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
}

/* SSE2 */

// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
// unsigned 16-bit integers in b.
// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s16(
        vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
// unsigned 32-bit integers in b.
//
// r0 := a0 + b0
// r1 := a1 + b1
// r2 := a2 + b2
// r3 := a3 + b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Adds the 2 signed or unsigned 64-bit integers in a to the 2 signed or
// unsigned 64-bit integers in b.
// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s64(
        vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
}

// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
// unsigned 8-bit integers in b.
// https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s8(
        vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Add packed double-precision (64-bit) floating-point elements in a and b, and
// store the results in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd
FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(
        vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[2];
    c[0] = da[0] + db[0];
    c[1] = da[1] + db[1];
    return vld1q_f32((float32_t *) c);
#endif
}

// Add the lower double-precision (64-bit) floating-point element in a and b,
// store the result in the lower element of dst, and copy the upper element from
// a to the upper element of dst.
//
// dst[63:0] := a[63:0] + b[63:0]
// dst[127:64] := a[127:64]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sd
FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
    return _mm_move_sd(a, _mm_add_pd(a, b));
#else
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[2];
    c[0] = da[0] + db[0];
    c[1] = da[1];
    return vld1q_f32((float32_t *) c);
#endif
}

// Add 64-bit integers a and b, and store the result in dst.
//
// dst[63:0] := a[63:0] + b[63:0]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64
FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
{
    return vreinterpret_m64_s64(
        vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
}

// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
// and saturates.
//
// r0 := SignedSaturate(a0 + b0)
// r1 := SignedSaturate(a1 + b1)
// ...
// r7 := SignedSaturate(a7 + b7) // // https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_s16( vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Add packed signed 8-bit integers in a and b using saturation, and store the // results in dst. // // FOR j := 0 to 15 // i := j*8 // dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8 FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_s8( vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } // Add packed unsigned 16-bit integers in a and b using saturation, and store // the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu16 FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b) { return vreinterpretq_m128i_u16( vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); } // Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in // b and saturates.. // https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); } // Compute the bitwise AND of packed double-precision (64-bit) floating-point // elements in a and b, and store the results in dst. // // FOR j := 0 to 1 // i := j*64 // dst[i+63:i] := a[i+63:i] AND b[i+63:i] // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) { return vreinterpretq_m128d_s64( vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); } // Computes the bitwise AND of the 128-bit value in a and the 128-bit value in // b. // // r := a & b // // https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Compute the bitwise NOT of packed double-precision (64-bit) floating-point // elements in a and then AND with b, and store the results in dst. // // FOR j := 0 to 1 // i := j*64 // dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b) { // *NOTE* argument swap return vreinterpretq_m128d_s64( vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a))); } // Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the // 128-bit value in a. // // r := (~a) & b // // https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( vbicq_s32(vreinterpretq_s32_m128i(b), vreinterpretq_s32_m128i(a))); // *NOTE* argument swap } // Computes the average of the 8 unsigned 16-bit integers in a and the 8 // unsigned 16-bit integers in b and rounds. // // r0 := (a0 + b0) / 2 // r1 := (a1 + b1) / 2 // ... 
// r7 := (a7 + b7) / 2 // // https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b) { return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)); } // Computes the average of the 16 unsigned 8-bit integers in a and the 16 // unsigned 8-bit integers in b and rounds. // // r0 := (a0 + b0) / 2 // r1 := (a1 + b1) / 2 // ... // r15 := (a15 + b15) / 2 // // https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); } // Shift a left by imm8 bytes while shifting in zeros, and store the results in // dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bslli_si128 #define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm) // Shift a right by imm8 bytes while shifting in zeros, and store the results in // dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bsrli_si128 #define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm) // Cast vector of type __m128d to type __m128. This intrinsic is only used for // compilation and does not generate any instructions, thus it has zero latency. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ps FORCE_INLINE __m128 _mm_castpd_ps(__m128d a) { return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a)); } // Cast vector of type __m128d to type __m128i. This intrinsic is only used for // compilation and does not generate any instructions, thus it has zero latency. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128 FORCE_INLINE __m128i _mm_castpd_si128(__m128d a) { return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a)); } // Cast vector of type __m128 to type __m128d. This intrinsic is only used for // compilation and does not generate any instructions, thus it has zero latency. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd FORCE_INLINE __m128d _mm_castps_pd(__m128 a) { return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a)); } // Applies a type cast to reinterpret four 32-bit floating point values passed // in as a 128-bit parameter as packed 32-bit integers. // https://msdn.microsoft.com/en-us/library/bb514099.aspx FORCE_INLINE __m128i _mm_castps_si128(__m128 a) { return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a)); } // Cast vector of type __m128i to type __m128d. This intrinsic is only used for // compilation and does not generate any instructions, thus it has zero latency. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_pd FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a) { #if defined(__aarch64__) return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a)); #else return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a)); #endif } // Applies a type cast to reinterpret four 32-bit integers passed in as a // 128-bit parameter as packed 32-bit floating point values. // https://msdn.microsoft.com/en-us/library/bb514029.aspx FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a) { return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a)); } // Cache line containing p is flushed and invalidated from all caches in the // coherency domain. : // https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx FORCE_INLINE void _mm_clflush(void const *p) { (void) p; // no corollary for Neon? 
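    // Intentionally a no-op: there is no portable, unprivileged NEON/ARM
    // equivalent of CLFLUSH available here, and the (void) cast above only
    // silences the unused-parameter warning.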
} // Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or // unsigned 16-bit integers in b for equality. // https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_u16( vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Compare packed 32-bit integers in a and b for equality, and store the results // in dst FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_u32( vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or // unsigned 8-bit integers in b for equality. // https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } // Compare packed double-precision (64-bit) floating-point elements in a and b // for equality, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_pd FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_u64( vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); uint32x4_t swapped = vrev64q_u32(cmp); return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b for equality, store the result in the lower element of dst, and copy the // upper element from a to the upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_sd FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b) { return _mm_move_sd(a, _mm_cmpeq_pd(a, b)); } // Compare packed double-precision (64-bit) floating-point elements in a and b // for greater-than-or-equal, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_pd FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_u64( vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b for greater-than-or-equal, store the result in the lower element of dst, // and copy the upper element from a to the upper element of dst. 
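// Implementation note (added for clarity): on AArch64 this is composed from
// _mm_cmpge_pd and _mm_move_sd; the ARMv7 fallback below compares only the
// low lane in scalar C and passes the upper 64 bits of a through unchanged.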
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_sd FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b) { #if defined(__aarch64__) return _mm_move_sd(a, _mm_cmpge_pd(a, b)); #else // expand "_mm_cmpge_pd()" to reduce unnecessary operations uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers // in b for greater than. // // r0 := (a0 > b0) ? 0xffff : 0x0 // r1 := (a1 > b1) ? 0xffff : 0x0 // ... // r7 := (a7 > b7) ? 0xffff : 0x0 // // https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_u16( vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers // in b for greater than. // https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_u32( vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers // in b for greater than. // // r0 := (a0 > b0) ? 0xff : 0x0 // r1 := (a1 > b1) ? 0xff : 0x0 // ... // r15 := (a15 > b15) ? 0xff : 0x0 // // https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } // Compare packed double-precision (64-bit) floating-point elements in a and b // for greater-than, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_pd FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_u64( vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b for greater-than, store the result in the lower element of dst, and copy // the upper element from a to the upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_sd FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b) { #if defined(__aarch64__) return _mm_move_sd(a, _mm_cmpgt_pd(a, b)); #else // expand "_mm_cmpge_pd()" to reduce unnecessary operations uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = (*(double *) &a0) > (*(double *) &b0) ? 
~UINT64_C(0) : UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare packed double-precision (64-bit) floating-point elements in a and b // for less-than-or-equal, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_pd FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_u64( vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b for less-than-or-equal, store the result in the lower element of dst, and // copy the upper element from a to the upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_sd FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b) { #if defined(__aarch64__) return _mm_move_sd(a, _mm_cmple_pd(a, b)); #else // expand "_mm_cmpge_pd()" to reduce unnecessary operations uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers // in b for less than. // // r0 := (a0 < b0) ? 0xffff : 0x0 // r1 := (a1 < b1) ? 0xffff : 0x0 // ... // r7 := (a7 < b7) ? 0xffff : 0x0 // // https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_u16( vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers // in b for less than. // https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_u32( vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers // in b for lesser than. // https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } // Compare packed double-precision (64-bit) floating-point elements in a and b // for less-than, and store the results in dst. 
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_pd FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_u64( vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b for less-than, store the result in the lower element of dst, and copy the // upper element from a to the upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_sd FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b) { #if defined(__aarch64__) return _mm_move_sd(a, _mm_cmplt_pd(a, b)); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare packed double-precision (64-bit) floating-point elements in a and b // for not-equal, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_pd FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64( vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))))); #else // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); uint32x4_t swapped = vrev64q_u32(cmp); return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped))); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b for not-equal, store the result in the lower element of dst, and copy the // upper element from a to the upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_sd FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b) { return _mm_move_sd(a, _mm_cmpneq_pd(a, b)); } // Compare packed double-precision (64-bit) floating-point elements in a and b // for not-greater-than-or-equal, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_pd FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_u64(veorq_u64( vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), vdupq_n_u64(UINT64_MAX))); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); d[1] = !((*(double *) &a1) >= (*(double *) &b1)) ? 
~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b for not-greater-than-or-equal, store the result in the lower element of // dst, and copy the upper element from a to the upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_sd FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b) { return _mm_move_sd(a, _mm_cmpnge_pd(a, b)); } // Compare packed double-precision (64-bit) floating-point elements in a and b // for not-greater-than, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_cmpngt_pd FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_u64(veorq_u64( vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), vdupq_n_u64(UINT64_MAX))); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); d[1] = !((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b for not-greater-than, store the result in the lower element of dst, and // copy the upper element from a to the upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_sd FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b) { return _mm_move_sd(a, _mm_cmpngt_pd(a, b)); } // Compare packed double-precision (64-bit) floating-point elements in a and b // for not-less-than-or-equal, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_pd FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_u64(veorq_u64( vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), vdupq_n_u64(UINT64_MAX))); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); d[1] = !((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b for not-less-than-or-equal, store the result in the lower element of dst, // and copy the upper element from a to the upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_sd FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b) { return _mm_move_sd(a, _mm_cmpnle_pd(a, b)); } // Compare packed double-precision (64-bit) floating-point elements in a and b // for not-less-than, and store the results in dst. 
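// Implementation note (added for clarity): the "not-..." predicates are not
// equivalent to swapping operands. NLT must return all-ones whenever the
// operands are unordered (NaN), which is why the implementations negate the
// ordered comparison (veorq against all-ones on AArch64) instead of
// reversing it.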
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_pd FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_u64(veorq_u64( vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), vdupq_n_u64(UINT64_MAX))); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); d[1] = !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b for not-less-than, store the result in the lower element of dst, and copy // the upper element from a to the upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_sd FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b) { return _mm_move_sd(a, _mm_cmpnlt_pd(a, b)); } // Compare packed double-precision (64-bit) floating-point elements in a and b // to see if neither is NaN, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_pd FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b) { #if defined(__aarch64__) // Excluding NaNs, any two floating point numbers can be compared. uint64x2_t not_nan_a = vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a)); uint64x2_t not_nan_b = vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b)); return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b)); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = ((*(double *) &a0) == (*(double *) &a0) && (*(double *) &b0) == (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); d[1] = ((*(double *) &a1) == (*(double *) &a1) && (*(double *) &b1) == (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b to see if neither is NaN, store the result in the lower element of dst, and // copy the upper element from a to the upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_sd FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b) { #if defined(__aarch64__) return _mm_move_sd(a, _mm_cmpord_pd(a, b)); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t d[2]; d[0] = ((*(double *) &a0) == (*(double *) &a0) && (*(double *) &b0) == (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare packed double-precision (64-bit) floating-point elements in a and b // to see if either is NaN, and store the results in dst. 
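// Implementation note (added for clarity): NaN is detected by self-comparison,
// since x == x is false only when x is NaN; "unordered" is simply the
// complement of the "both operands equal themselves" mask used by
// _mm_cmpord_pd above.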
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_pd FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b) { #if defined(__aarch64__) // Two NaNs are not equal in comparison operation. uint64x2_t not_nan_a = vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a)); uint64x2_t not_nan_b = vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b)); return vreinterpretq_m128d_s32( vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b)))); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = ((*(double *) &a0) == (*(double *) &a0) && (*(double *) &b0) == (*(double *) &b0)) ? UINT64_C(0) : ~UINT64_C(0); d[1] = ((*(double *) &a1) == (*(double *) &a1) && (*(double *) &b1) == (*(double *) &b1)) ? UINT64_C(0) : ~UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b to see if either is NaN, store the result in the lower element of dst, and // copy the upper element from a to the upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_sd FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b) { #if defined(__aarch64__) return _mm_move_sd(a, _mm_cmpunord_pd(a, b)); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t d[2]; d[0] = ((*(double *) &a0) == (*(double *) &a0) && (*(double *) &b0) == (*(double *) &b0)) ? UINT64_C(0) : ~UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point element in a and b // for greater-than-or-equal, and return the boolean result (0 or 1). // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_sd FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b) { #if defined(__aarch64__) return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1; #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); return (*(double *) &a0 >= *(double *) &b0); #endif } // Compare the lower double-precision (64-bit) floating-point element in a and b // for greater-than, and return the boolean result (0 or 1). // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_sd FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b) { #if defined(__aarch64__) return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1; #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); return (*(double *) &a0 > *(double *) &b0); #endif } // Compare the lower double-precision (64-bit) floating-point element in a and b // for less-than-or-equal, and return the boolean result (0 or 1). 
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_sd FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b) { #if defined(__aarch64__) return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1; #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); return (*(double *) &a0 <= *(double *) &b0); #endif } // Compare the lower double-precision (64-bit) floating-point element in a and b // for less-than, and return the boolean result (0 or 1). // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_sd FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b) { #if defined(__aarch64__) return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1; #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); return (*(double *) &a0 < *(double *) &b0); #endif } // Compare the lower double-precision (64-bit) floating-point element in a and b // for equality, and return the boolean result (0 or 1). // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_sd FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b) { #if defined(__aarch64__) return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1; #else uint32x4_t a_not_nan = vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a)); uint32x4_t b_not_nan = vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b)); uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); uint32x4_t a_eq_b = vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan), vreinterpretq_u64_u32(a_eq_b)); return vgetq_lane_u64(and_results, 0) & 0x1; #endif } // Compare the lower double-precision (64-bit) floating-point element in a and b // for not-equal, and return the boolean result (0 or 1). // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_sd FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b) { return !_mm_comieq_sd(a, b); } // Convert packed signed 32-bit integers in a to packed double-precision // (64-bit) floating-point elements, and store the results in dst. // // FOR j := 0 to 1 // i := j*32 // m := j*64 // dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_pd FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))))); #else double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0); double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1); return _mm_set_pd(a1, a0); #endif } // Converts the four signed 32-bit integer values of a to single-precision, // floating-point values // https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a) { return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a))); } // Convert packed double-precision (64-bit) floating-point elements in a to // packed 32-bit integers, and store the results in dst. 
// // FOR j := 0 to 1 // i := 32*j // k := 64*j // dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_epi32 FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a) { __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); double d0 = ((double *) &rnd)[0]; double d1 = ((double *) &rnd)[1]; return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0); } // Convert packed double-precision (64-bit) floating-point elements in a to // packed 32-bit integers, and store the results in dst. // // FOR j := 0 to 1 // i := 32*j // k := 64*j // dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_pi32 FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a) { __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); double d0 = ((double *) &rnd)[0]; double d1 = ((double *) &rnd)[1]; int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1}; return vreinterpret_m64_s32(vld1_s32(data)); } // Convert packed double-precision (64-bit) floating-point elements in a to // packed single-precision (32-bit) floating-point elements, and store the // results in dst. // // FOR j := 0 to 1 // i := 32*j // k := 64*j // dst[i+31:i] := Convert_FP64_To_FP32(a[k+64:k]) // ENDFOR // dst[127:64] := 0 // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a) { #if defined(__aarch64__) float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a)); return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0))); #else float a0 = (float) ((double *) &a)[0]; float a1 = (float) ((double *) &a)[1]; return _mm_set_ps(0, 0, a1, a0); #endif } // Convert packed signed 32-bit integers in a to packed double-precision // (64-bit) floating-point elements, and store the results in dst. // // FOR j := 0 to 1 // i := j*32 // m := j*64 // dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_pd FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a)))); #else double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0); double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1); return _mm_set_pd(a1, a0); #endif } // Converts the four single-precision, floating-point values of a to signed // 32-bit integer values. // // r0 := (int) a0 // r1 := (int) a1 // r2 := (int) a2 // r3 := (int) a3 // // https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx // *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A // does not support! It is supported on ARMv8-A however. 
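// Note (added for clarity): the ARMv7 _MM_ROUND_NEAREST branch below therefore
// emulates round-half-to-even by computing both a truncated and a
// conventionally rounded value and selecting the even candidate whenever the
// fractional part is exactly +/-0.5.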
FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a) { #if defined(__aarch64__) switch (_MM_GET_ROUNDING_MODE()) { case _MM_ROUND_NEAREST: return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a)); case _MM_ROUND_DOWN: return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a)); case _MM_ROUND_UP: return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a)); default: // _MM_ROUND_TOWARD_ZERO return vreinterpretq_m128i_s32(vcvtq_s32_f32(a)); } #else float *f = (float *) &a; switch (_MM_GET_ROUNDING_MODE()) { case _MM_ROUND_NEAREST: { uint32x4_t signmask = vdupq_n_u32(0x80000000); float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a), vdupq_n_f32(0.5f)); /* +/- 0.5 */ int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32( vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/ int32x4_t r_trunc = vcvtq_s32_f32( vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */ int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32( vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ float32x4_t delta = vsubq_f32( vreinterpretq_f32_m128(a), vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */ return vreinterpretq_m128i_s32( vbslq_s32(is_delta_half, r_even, r_normal)); } case _MM_ROUND_DOWN: return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0])); case _MM_ROUND_UP: return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0])); default: // _MM_ROUND_TOWARD_ZERO return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1], (int32_t) f[0]); } #endif } // Convert packed single-precision (32-bit) floating-point elements in a to // packed double-precision (64-bit) floating-point elements, and store the // results in dst. // // FOR j := 0 to 1 // i := 64*j // k := 32*j // dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a)))); #else double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1); return _mm_set_pd(a1, a0); #endif } // Copy the lower double-precision (64-bit) floating-point element of a to dst. // // dst[63:0] := a[63:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64 FORCE_INLINE double _mm_cvtsd_f64(__m128d a) { #if defined(__aarch64__) return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0); #else return ((double *) &a)[0]; #endif } // Convert the lower double-precision (64-bit) floating-point element in a to a // 32-bit integer, and store the result in dst. // // dst[31:0] := Convert_FP64_To_Int32(a[63:0]) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si32 FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a) { #if defined(__aarch64__) return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0); #else __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); double ret = ((double *) &rnd)[0]; return (int32_t) ret; #endif } // Convert the lower double-precision (64-bit) floating-point element in a to a // 64-bit integer, and store the result in dst. 
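/* ---------------------------------------------------------------------
 * Editor's note: illustrative sketch, not part of the upstream sse2neon
 * header. It shows the effect of the round-to-nearest-even emulation in
 * _mm_cvtps_epi32 above under the default rounding mode. The helper name
 * sse2neon_example_round_ties_to_even is hypothetical; it only uses
 * intrinsics and reinterpret macros already defined above this point.
 */
static inline void sse2neon_example_round_ties_to_even(int32_t out[4])
{
    float ALIGN_STRUCT(16) f[4] = {0.5f, 1.5f, 2.5f, 3.5f};
    __m128 v = vreinterpretq_m128_f32(vld1q_f32(f));
    /* With the default round-to-nearest mode, halves go to the nearest
     * even integer, so out becomes {0, 2, 2, 4} rather than {1, 2, 3, 4}. */
    __m128i r = _mm_cvtps_epi32(v);
    vst1q_s32(out, vreinterpretq_s32_m128i(r));
}
/* --------------------------------------------------------------------- */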
// // dst[63:0] := Convert_FP64_To_Int64(a[63:0]) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64 FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a) { #if defined(__aarch64__) return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0); #else __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); double ret = ((double *) &rnd)[0]; return (int64_t) ret; #endif } // Convert the lower double-precision (64-bit) floating-point element in a to a // 64-bit integer, and store the result in dst. // // dst[63:0] := Convert_FP64_To_Int64(a[63:0]) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64x #define _mm_cvtsd_si64x _mm_cvtsd_si64 // Convert the lower double-precision (64-bit) floating-point element in b to a // single-precision (32-bit) floating-point element, store the result in the // lower element of dst, and copy the upper 3 packed elements from a to the // upper elements of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_ss FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128_f32(vsetq_lane_f32( vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0), vreinterpretq_f32_m128(a), 0)); #else return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0], vreinterpretq_f32_m128(a), 0)); #endif } // Copy the lower 32-bit integer in a to dst. // // dst[31:0] := a[31:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32 FORCE_INLINE int _mm_cvtsi128_si32(__m128i a) { return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0); } // Copy the lower 64-bit integer in a to dst. // // dst[63:0] := a[63:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64 FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a) { return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0); } // Copy the lower 64-bit integer in a to dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x #define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a) // Convert the signed 32-bit integer b to a double-precision (64-bit) // floating-point element, store the result in the lower element of dst, and // copy the upper element from a to the upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_sd FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0)); #else double bf = (double) b; return vreinterpretq_m128d_s64( vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0)); #endif } // Copy the lower 64-bit integer in a to dst. // // dst[63:0] := a[63:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x #define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a) // Moves 32-bit integer a to the least significant 32 bits of an __m128 object, // zero extending the upper bits. // // r0 := a // r1 := 0x0 // r2 := 0x0 // r3 := 0x0 // // https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx FORCE_INLINE __m128i _mm_cvtsi32_si128(int a) { return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0)); } // Convert the signed 64-bit integer b to a double-precision (64-bit) // floating-point element, store the result in the lower element of dst, and // copy the upper element from a to the upper element of dst. 
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_sd FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0)); #else double bf = (double) b; return vreinterpretq_m128d_s64( vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0)); #endif } // Moves 64-bit integer a to the least significant 64 bits of an __m128 object, // zero extending the upper bits. // // r0 := a // r1 := 0x0 FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a) { return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0)); } // Copy 64-bit integer a to the lower element of dst, and zero the upper // element. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_si128 #define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a) // Convert the signed 64-bit integer b to a double-precision (64-bit) // floating-point element, store the result in the lower element of dst, and // copy the upper element from a to the upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_sd #define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b) // Convert the lower single-precision (32-bit) floating-point element in b to a // double-precision (64-bit) floating-point element, store the result in the // lower element of dst, and copy the upper element from a to the upper element // of dst. // // dst[63:0] := Convert_FP32_To_FP64(b[31:0]) // dst[127:64] := a[127:64] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sd FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b) { double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); #if defined(__aarch64__) return vreinterpretq_m128d_f64( vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0)); #else return vreinterpretq_m128d_s64( vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0)); #endif } // Convert packed double-precision (64-bit) floating-point elements in a to // packed 32-bit integers with truncation, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_epi32 FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a) { double a0 = ((double *) &a)[0]; double a1 = ((double *) &a)[1]; return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0); } // Convert packed double-precision (64-bit) floating-point elements in a to // packed 32-bit integers with truncation, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_pi32 FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a) { double a0 = ((double *) &a)[0]; double a1 = ((double *) &a)[1]; int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1}; return vreinterpret_m64_s32(vld1_s32(data)); } // Converts the four single-precision, floating-point values of a to signed // 32-bit integer values using truncate. // https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a) { return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))); } // Convert the lower double-precision (64-bit) floating-point element in a to a // 32-bit integer with truncation, and store the result in dst. 
// // dst[63:0] := Convert_FP64_To_Int32_Truncate(a[63:0]) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si32 FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a) { double ret = *((double *) &a); return (int32_t) ret; } // Convert the lower double-precision (64-bit) floating-point element in a to a // 64-bit integer with truncation, and store the result in dst. // // dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64 FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a) { #if defined(__aarch64__) return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0); #else double ret = *((double *) &a); return (int64_t) ret; #endif } // Convert the lower double-precision (64-bit) floating-point element in a to a // 64-bit integer with truncation, and store the result in dst. // // dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x #define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a) // Divide packed double-precision (64-bit) floating-point elements in a by // packed elements in b, and store the results in dst. // // FOR j := 0 to 1 // i := 64*j // dst[i+63:i] := a[i+63:i] / b[i+63:i] // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_pd FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else double *da = (double *) &a; double *db = (double *) &b; double c[2]; c[0] = da[0] / db[0]; c[1] = da[1] / db[1]; return vld1q_f32((float32_t *) c); #endif } // Divide the lower double-precision (64-bit) floating-point element in a by the // lower double-precision (64-bit) floating-point element in b, store the result // in the lower element of dst, and copy the upper element from a to the upper // element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sd FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b) { #if defined(__aarch64__) float64x2_t tmp = vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)); return vreinterpretq_m128d_f64( vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1)); #else return _mm_move_sd(a, _mm_div_pd(a, b)); #endif } // Extracts the selected signed or unsigned 16-bit integer from a and zero // extends. // https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx // FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm) #define _mm_extract_epi16(a, imm) \ vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm)) // Inserts the least significant 16 bits of b into the selected 16-bit integer // of a. // https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx // FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b, // __constrange(0,8) int imm) #define _mm_insert_epi16(a, b, imm) \ __extension__({ \ vreinterpretq_m128i_s16( \ vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \ }) // Loads two double-precision from 16-byte aligned memory, floating-point // values. 
// // dst[127:0] := MEM[mem_addr+127:mem_addr] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd FORCE_INLINE __m128d _mm_load_pd(const double *p) { #if defined(__aarch64__) return vreinterpretq_m128d_f64(vld1q_f64(p)); #else const float *fp = (const float *) p; float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]}; return vreinterpretq_m128d_f32(vld1q_f32(data)); #endif } // Load a double-precision (64-bit) floating-point element from memory into both // elements of dst. // // dst[63:0] := MEM[mem_addr+63:mem_addr] // dst[127:64] := MEM[mem_addr+63:mem_addr] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1 #define _mm_load_pd1 _mm_load1_pd // Load a double-precision (64-bit) floating-point element from memory into the // lower of dst, and zero the upper element. mem_addr does not need to be // aligned on any particular boundary. // // dst[63:0] := MEM[mem_addr+63:mem_addr] // dst[127:64] := 0 // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd FORCE_INLINE __m128d _mm_load_sd(const double *p) { #if defined(__aarch64__) return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0)); #else const float *fp = (const float *) p; float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0}; return vreinterpretq_m128d_f32(vld1q_f32(data)); #endif } // Loads 128-bit value. : // https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx FORCE_INLINE __m128i _mm_load_si128(const __m128i *p) { return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); } // Load a double-precision (64-bit) floating-point element from memory into both // elements of dst. // // dst[63:0] := MEM[mem_addr+63:mem_addr] // dst[127:64] := MEM[mem_addr+63:mem_addr] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd FORCE_INLINE __m128d _mm_load1_pd(const double *p) { #if defined(__aarch64__) return vreinterpretq_m128d_f64(vld1q_dup_f64(p)); #else return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p)); #endif } // Load a double-precision (64-bit) floating-point element from memory into the // upper element of dst, and copy the lower element from a to dst. mem_addr does // not need to be aligned on any particular boundary. // // dst[63:0] := a[63:0] // dst[127:64] := MEM[mem_addr+63:mem_addr] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p))); #else return vreinterpretq_m128d_f32(vcombine_f32( vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p))); #endif } // Load 64-bit integer from memory into the first element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_epi64 FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p) { /* Load the lower 64 bits of the value pointed to by p into the * lower 64 bits of the result, zeroing the upper 64 bits of the result. */ return vreinterpretq_m128i_s32( vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0))); } // Load a double-precision (64-bit) floating-point element from memory into the // lower element of dst, and copy the upper element from a to dst. mem_addr does // not need to be aligned on any particular boundary. 
// // dst[63:0] := MEM[mem_addr+63:mem_addr] // dst[127:64] := a[127:64] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a)))); #else return vreinterpretq_m128d_f32( vcombine_f32(vld1_f32((const float *) p), vget_high_f32(vreinterpretq_f32_m128d(a)))); #endif } // Load 2 double-precision (64-bit) floating-point elements from memory into dst // in reverse order. mem_addr must be aligned on a 16-byte boundary or a // general-protection exception may be generated. // // dst[63:0] := MEM[mem_addr+127:mem_addr+64] // dst[127:64] := MEM[mem_addr+63:mem_addr] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd FORCE_INLINE __m128d _mm_loadr_pd(const double *p) { #if defined(__aarch64__) float64x2_t v = vld1q_f64(p); return vreinterpretq_m128d_f64(vextq_f64(v, v, 1)); #else int64x2_t v = vld1q_s64((const int64_t *) p); return vreinterpretq_m128d_s64(vextq_s64(v, v, 1)); #endif } // Loads two double-precision from unaligned memory, floating-point values. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd FORCE_INLINE __m128d _mm_loadu_pd(const double *p) { return _mm_load_pd(p); } // Loads 128-bit value. : // https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p) { return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); } // Load unaligned 32-bit integer from memory into the first element of dst. // // dst[31:0] := MEM[mem_addr+31:mem_addr] // dst[MAX:32] := 0 // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32 FORCE_INLINE __m128i _mm_loadu_si32(const void *p) { return vreinterpretq_m128i_s32( vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0)); } // Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit // integers from b. // // r0 := (a0 * b0) + (a1 * b1) // r1 := (a2 * b2) + (a3 * b3) // r2 := (a4 * b4) + (a5 * b5) // r3 := (a6 * b6) + (a7 * b7) // https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b) { int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)), vget_low_s16(vreinterpretq_s16_m128i(b))); int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)), vget_high_s16(vreinterpretq_s16_m128i(b))); int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low)); int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high)); return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum)); } // Conditionally store 8-bit integer elements from a into memory using mask // (elements are not stored when the highest bit is not set in the corresponding // element) and a non-temporal memory hint. mem_addr does not need to be aligned // on any particular boundary. 
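/* ---------------------------------------------------------------------
 * Editor's note: illustrative sketch, not part of the upstream sse2neon
 * header. _mm_madd_epi16 above is the usual building block for 16-bit
 * integer dot products: each 32-bit result lane holds
 * a[2j]*b[2j] + a[2j+1]*b[2j+1]. The helper name sse2neon_example_madd
 * is hypothetical and uses only intrinsics defined above this point.
 */
static inline void sse2neon_example_madd(const int16_t a[8],
                                         const int16_t b[8],
                                         int32_t out[4])
{
    __m128i va = vreinterpretq_m128i_s16(vld1q_s16(a));
    __m128i vb = vreinterpretq_m128i_s16(vld1q_s16(b));
    __m128i s = _mm_madd_epi16(va, vb); /* four pairwise sums of products */
    vst1q_s32(out, vreinterpretq_s32_m128i(s));
}
/* --------------------------------------------------------------------- */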
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmoveu_si128 FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr) { int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7); __m128 b = _mm_load_ps((const float *) mem_addr); int8x16_t masked = vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128(b)); vst1q_s8((int8_t *) mem_addr, masked); } // Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8 // signed 16-bit integers from b. // https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_s16( vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the // 16 unsigned 8-bit integers from b. // https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); } // Compare packed double-precision (64-bit) floating-point elements in a and b, // and store packed maximum values in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pd FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b) { #if defined(__aarch64__) #if SSE2NEON_PRECISE_MINMAX float64x2_t _a = vreinterpretq_f64_m128d(a); float64x2_t _b = vreinterpretq_f64_m128d(b); return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b)); #else return vreinterpretq_m128d_f64( vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #endif #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0; d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b, store the maximum value in the lower element of dst, and copy the upper // element from a to the upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_sd FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b) { #if defined(__aarch64__) return _mm_move_sd(a, _mm_max_pd(a, b)); #else double *da = (double *) &a; double *db = (double *) &b; double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]}; return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c)); #endif } // Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8 // signed 16-bit integers from b. // https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_s16( vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the // 16 unsigned 8-bit integers from b. 
// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); } // Compare packed double-precision (64-bit) floating-point elements in a and b, // and store packed minimum values in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pd FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b) { #if defined(__aarch64__) #if SSE2NEON_PRECISE_MINMAX float64x2_t _a = vreinterpretq_f64_m128d(a); float64x2_t _b = vreinterpretq_f64_m128d(b); return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b)); #else return vreinterpretq_m128d_f64( vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #endif #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0; d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b, store the minimum value in the lower element of dst, and copy the upper // element from a to the upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_sd FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b) { #if defined(__aarch64__) return _mm_move_sd(a, _mm_min_pd(a, b)); #else double *da = (double *) &a; double *db = (double *) &b; double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]}; return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c)); #endif } // Copy the lower 64-bit integer in a to the lower element of dst, and zero the // upper element. // // dst[63:0] := a[63:0] // dst[127:64] := 0 // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64 FORCE_INLINE __m128i _mm_move_epi64(__m128i a) { return vreinterpretq_m128i_s64( vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1)); } // Move the lower double-precision (64-bit) floating-point element from b to the // lower element of dst, and copy the upper element from a to the upper element // of dst. // // dst[63:0] := b[63:0] // dst[127:64] := a[127:64] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sd FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b) { return vreinterpretq_m128d_f32( vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)), vget_high_f32(vreinterpretq_f32_m128d(a)))); } // NEON does not provide a version of this function. // Creates a 16-bit mask from the most significant bits of the 16 signed or // unsigned 8-bit integers in a and zero extends the upper bits. // https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx FORCE_INLINE int _mm_movemask_epi8(__m128i a) { // Use increasingly wide shifts+adds to collect the sign bits // together. // Since the widening shifts would be rather confusing to follow in little // endian, everything will be illustrated in big endian order instead. This // has a different result - the bits would actually be reversed on a big // endian machine. 
// Starting input (only half the elements are shown): // 89 ff 1d c0 00 10 99 33 uint8x16_t input = vreinterpretq_u8_m128i(a); // Shift out everything but the sign bits with an unsigned shift right. // // Bytes of the vector:: // 89 ff 1d c0 00 10 99 33 // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7) // | | | | | | | | // 01 01 00 01 00 00 01 00 // // Bits of first important lane(s): // 10001001 (89) // \______ // | // 00000001 (01) uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7)); // Merge the even lanes together with a 16-bit unsigned shift right + add. // 'xx' represents garbage data which will be ignored in the final result. // In the important bytes, the add functions like a binary OR. // // 01 01 00 01 00 00 01 00 // \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7)) // \| \| \| \| // xx 03 xx 01 xx 00 xx 02 // // 00000001 00000001 (01 01) // \_______ | // \| // xxxxxxxx xxxxxx11 (xx 03) uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); // Repeat with a wider 32-bit shift + add. // xx 03 xx 01 xx 00 xx 02 // \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >> // 14)) // \| \| // xx xx xx 0d xx xx xx 02 // // 00000011 00000001 (03 01) // \\_____ || // '----.\|| // xxxxxxxx xxxx1101 (xx 0d) uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); // Last, an even wider 64-bit shift + add to get our result in the low 8 bit // lanes. xx xx xx 0d xx xx xx 02 // \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >> // 28)) // \| // xx xx xx xx xx xx xx d2 // // 00001101 00000010 (0d 02) // \ \___ | | // '---. \| | // xxxxxxxx 11010010 (xx d2) uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts. // xx xx xx xx xx xx xx d2 // || return paired64[0] // d2 // Note: Little endian would return the correct value 4b (01001011) instead. return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8); } // Set each bit of mask dst based on the most significant bit of the // corresponding packed double-precision (64-bit) floating-point element in a. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pd FORCE_INLINE int _mm_movemask_pd(__m128d a) { uint64x2_t input = vreinterpretq_u64_m128d(a); uint64x2_t high_bits = vshrq_n_u64(input, 63); return (int) (vgetq_lane_u64(high_bits, 0)) | ((int) (vgetq_lane_u64(high_bits, 1) << 1)); } // Copy the lower 64-bit integer in a to dst. // // dst[63:0] := a[63:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64 FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a) { return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a))); } // Copy the 64-bit integer a to the lower element of dst, and zero the upper // element. // // dst[63:0] := a[63:0] // dst[127:64] := 0 // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64 FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a) { return vreinterpretq_m128i_s64( vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0))); } // Multiply the low unsigned 32-bit integers from each packed 64-bit element in // a and b, and store the unsigned 64-bit results in dst. // // r0 := (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF) // r1 := (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF) FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b) { // vmull_u32 upcasts instead of masking, so we downcast. 
uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a)); uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b)); return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo)); } // Multiply packed double-precision (64-bit) floating-point elements in a and b, // and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else double *da = (double *) &a; double *db = (double *) &b; double c[2]; c[0] = da[0] * db[0]; c[1] = da[1] * db[1]; return vld1q_f32((float32_t *) c); #endif } // Multiply the lower double-precision (64-bit) floating-point element in a and // b, store the result in the lower element of dst, and copy the upper element // from a to the upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_sd FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b) { return _mm_move_sd(a, _mm_mul_pd(a, b)); } // Multiply the low unsigned 32-bit integers from a and b, and store the // unsigned 64-bit result in dst. // // dst[63:0] := a[31:0] * b[31:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32 FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b) { return vreinterpret_m64_u64(vget_low_u64( vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b)))); } // Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit // integers from b. // // r0 := (a0 * b0)[31:16] // r1 := (a1 * b1)[31:16] // ... // r7 := (a7 * b7)[31:16] // // https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b) { /* FIXME: issue with large values because of result saturation */ // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a), // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1)); int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a)); int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b)); int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */ int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a)); int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b)); int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */ uint16x8x2_t r = vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654)); return vreinterpretq_m128i_u16(r.val[1]); } // Multiply the packed unsigned 16-bit integers in a and b, producing // intermediate 32-bit integers, and store the high 16 bits of the intermediate // integers in dst. 
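/* ---------------------------------------------------------------------
 * Editor's note: illustrative sketch, not part of the upstream sse2neon
 * header. It demonstrates the byte mask produced by the _mm_movemask_epi8
 * emulation above: bit i of the result is the most significant bit of
 * byte i, so counting the set bits counts bytes >= 0x80 (for example,
 * non-ASCII bytes in UTF-8 text). The helper name is hypothetical.
 */
static inline int sse2neon_example_count_high_bytes(const uint8_t p[16])
{
    __m128i v = vreinterpretq_m128i_u8(vld1q_u8(p));
    int mask = _mm_movemask_epi8(v); /* 16-bit mask of byte sign bits */
    int n = 0;
    while (mask) {
        n += mask & 1;
        mask >>= 1;
    }
    return n;
}
/* --------------------------------------------------------------------- */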
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epu16 FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b) { uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a)); uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b)); uint32x4_t ab3210 = vmull_u16(a3210, b3210); #if defined(__aarch64__) uint32x4_t ab7654 = vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)); uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654)); return vreinterpretq_m128i_u16(r); #else uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a)); uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b)); uint32x4_t ab7654 = vmull_u16(a7654, b7654); uint16x8x2_t r = vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654)); return vreinterpretq_m128i_u16(r.val[1]); #endif } // Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or // unsigned 16-bit integers from b. // // r0 := (a0 * b0)[15:0] // r1 := (a1 * b1)[15:0] // ... // r7 := (a7 * b7)[15:0] // // https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_s16( vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Compute the bitwise OR of packed double-precision (64-bit) floating-point // elements in a and b, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_or_pd FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b) { return vreinterpretq_m128d_s64( vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); } // Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b. // // r := a | b // // https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Packs the 16 signed 16-bit integers from a and b into 8-bit integers and // saturates. // https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_s8( vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)), vqmovn_s16(vreinterpretq_s16_m128i(b)))); } // Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers // and saturates. // // r0 := SignedSaturate(a0) // r1 := SignedSaturate(a1) // r2 := SignedSaturate(a2) // r3 := SignedSaturate(a3) // r4 := SignedSaturate(b0) // r5 := SignedSaturate(b1) // r6 := SignedSaturate(b2) // r7 := SignedSaturate(b3) // // https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_s16( vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)), vqmovn_s32(vreinterpretq_s32_m128i(b)))); } // Packs the 16 signed 16 - bit integers from a and b into 8 - bit unsigned // integers and saturates. // // r0 := UnsignedSaturate(a0) // r1 := UnsignedSaturate(a1) // ... // r7 := UnsignedSaturate(a7) // r8 := UnsignedSaturate(b0) // r9 := UnsignedSaturate(b1) // ... // r15 := UnsignedSaturate(b7) // // https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b) { return vreinterpretq_m128i_u8( vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)), vqmovun_s16(vreinterpretq_s16_m128i(b)))); } // Pause the processor. 
This is typically used in spin-wait loops and depending // on the x86 processor typical values are in the 40-100 cycle range. The // 'yield' instruction isn't a good fit because it's effectively a nop on most // Arm cores. Experience with several databases has shown an 'isb' is // a reasonable approximation. FORCE_INLINE void _mm_pause() { __asm__ __volatile__("isb\n"); } // Compute the absolute differences of packed unsigned 8-bit integers in a and // b, then horizontally sum each consecutive 8 differences to produce two // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low // 16 bits of 64-bit elements in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8 FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b) { uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b)); return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t))); } // Sets the 8 signed 16-bit integer values. // https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx FORCE_INLINE __m128i _mm_set_epi16(short i7, short i6, short i5, short i4, short i3, short i2, short i1, short i0) { int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; return vreinterpretq_m128i_s16(vld1q_s16(data)); } // Sets the 4 signed 32-bit integer values. // https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0) { int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3}; return vreinterpretq_m128i_s32(vld1q_s32(data)); } // Returns the __m128i structure with its two 64-bit integer values // initialized to the values of the two 64-bit integers passed in. // https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2) { return _mm_set_epi64x((int64_t) i1, (int64_t) i2); } // Returns the __m128i structure with its two 64-bit integer values // initialized to the values of the two 64-bit integers passed in. // https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2) { return vreinterpretq_m128i_s64( vcombine_s64(vcreate_s64(i2), vcreate_s64(i1))); } // Sets the 16 signed 8-bit integer values. // https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx FORCE_INLINE __m128i _mm_set_epi8(signed char b15, signed char b14, signed char b13, signed char b12, signed char b11, signed char b10, signed char b9, signed char b8, signed char b7, signed char b6, signed char b5, signed char b4, signed char b3, signed char b2, signed char b1, signed char b0) { int8_t ALIGN_STRUCT(16) data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; return (__m128i) vld1q_s8(data); } // Set packed double-precision (64-bit) floating-point elements in dst with the // supplied values. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd FORCE_INLINE __m128d _mm_set_pd(double e1, double e0) { double ALIGN_STRUCT(16) data[2] = {e0, e1}; #if defined(__aarch64__) return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data)); #else return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data)); #endif } // Broadcast double-precision (64-bit) floating-point value a to all elements of // dst.
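/* ---------------------------------------------------------------------
 * Editor's note: illustrative sketch, not part of the upstream sse2neon
 * header. _mm_sad_epu8 above returns two 64-bit lanes whose low 16 bits
 * each hold the sum of absolute differences of eight byte pairs; adding
 * the two lanes gives the total SAD over all 16 bytes, a common
 * motion-estimation kernel. The helper name sse2neon_example_sad16 is
 * hypothetical and uses only intrinsics defined above this point.
 */
static inline uint32_t sse2neon_example_sad16(const uint8_t a[16],
                                              const uint8_t b[16])
{
    __m128i va = vreinterpretq_m128i_u8(vld1q_u8(a));
    __m128i vb = vreinterpretq_m128i_u8(vld1q_u8(b));
    uint64x2_t s = vreinterpretq_u64_m128i(_mm_sad_epu8(va, vb));
    return (uint32_t) (vgetq_lane_u64(s, 0) + vgetq_lane_u64(s, 1));
}
/* --------------------------------------------------------------------- */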
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd1 #define _mm_set_pd1 _mm_set1_pd // Copy double-precision (64-bit) floating-point element a to the lower element // of dst, and zero the upper element. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sd FORCE_INLINE __m128d _mm_set_sd(double a) { return _mm_set_pd(0, a); } // Sets the 8 signed 16-bit integer values to w. // // r0 := w // r1 := w // ... // r7 := w // // https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx FORCE_INLINE __m128i _mm_set1_epi16(short w) { return vreinterpretq_m128i_s16(vdupq_n_s16(w)); } // Sets the 4 signed 32-bit integer values to i. // // r0 := i // r1 := i // r2 := i // r3 := I // // https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx FORCE_INLINE __m128i _mm_set1_epi32(int _i) { return vreinterpretq_m128i_s32(vdupq_n_s32(_i)); } // Sets the 2 signed 64-bit integer values to i. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100) FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i) { return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i)); } // Sets the 2 signed 64-bit integer values to i. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i) { return vreinterpretq_m128i_s64(vdupq_n_s64(_i)); } // Sets the 16 signed 8-bit integer values to b. // // r0 := b // r1 := b // ... // r15 := b // // https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx FORCE_INLINE __m128i _mm_set1_epi8(signed char w) { return vreinterpretq_m128i_s8(vdupq_n_s8(w)); } // Broadcast double-precision (64-bit) floating-point value a to all elements of // dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pd FORCE_INLINE __m128d _mm_set1_pd(double d) { #if defined(__aarch64__) return vreinterpretq_m128d_f64(vdupq_n_f64(d)); #else return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d)); #endif } // Sets the 8 signed 16-bit integer values in reverse order. // // Return Value // r0 := w0 // r1 := w1 // ... // r7 := w7 FORCE_INLINE __m128i _mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7) { int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7}; return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data)); } // Sets the 4 signed 32-bit integer values in reverse order // https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0) { int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0}; return vreinterpretq_m128i_s32(vld1q_s32(data)); } // Set packed 64-bit integers in dst with the supplied values in reverse order. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64 FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0) { return vreinterpretq_m128i_s64(vcombine_s64(e1, e0)); } // Sets the 16 signed 8-bit integer values in reverse order. 
// https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx FORCE_INLINE __m128i _mm_setr_epi8(signed char b0, signed char b1, signed char b2, signed char b3, signed char b4, signed char b5, signed char b6, signed char b7, signed char b8, signed char b9, signed char b10, signed char b11, signed char b12, signed char b13, signed char b14, signed char b15) { int8_t ALIGN_STRUCT(16) data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; return (__m128i) vld1q_s8(data); } // Set packed double-precision (64-bit) floating-point elements in dst with the // supplied values in reverse order. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_pd FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0) { return _mm_set_pd(e0, e1); } // Return vector of type __m128d with all elements set to zero. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd FORCE_INLINE __m128d _mm_setzero_pd(void) { #if defined(__aarch64__) return vreinterpretq_m128d_f64(vdupq_n_f64(0)); #else return vreinterpretq_m128d_f32(vdupq_n_f32(0)); #endif } // Sets the 128-bit value to zero // https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx FORCE_INLINE __m128i _mm_setzero_si128(void) { return vreinterpretq_m128i_s32(vdupq_n_s32(0)); } // Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm. // https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx // FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a, // __constrange(0,255) int imm) #if __has_builtin(__builtin_shufflevector) #define _mm_shuffle_epi32(a, imm) \ __extension__({ \ int32x4_t _input = vreinterpretq_s32_m128i(a); \ int32x4_t _shuf = __builtin_shufflevector( \ _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \ ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \ vreinterpretq_m128i_s32(_shuf); \ }) #else // generic #define _mm_shuffle_epi32(a, imm) \ __extension__({ \ __m128i ret; \ switch (imm) { \ case _MM_SHUFFLE(1, 0, 3, 2): \ ret = _mm_shuffle_epi_1032((a)); \ break; \ case _MM_SHUFFLE(2, 3, 0, 1): \ ret = _mm_shuffle_epi_2301((a)); \ break; \ case _MM_SHUFFLE(0, 3, 2, 1): \ ret = _mm_shuffle_epi_0321((a)); \ break; \ case _MM_SHUFFLE(2, 1, 0, 3): \ ret = _mm_shuffle_epi_2103((a)); \ break; \ case _MM_SHUFFLE(1, 0, 1, 0): \ ret = _mm_shuffle_epi_1010((a)); \ break; \ case _MM_SHUFFLE(1, 0, 0, 1): \ ret = _mm_shuffle_epi_1001((a)); \ break; \ case _MM_SHUFFLE(0, 1, 0, 1): \ ret = _mm_shuffle_epi_0101((a)); \ break; \ case _MM_SHUFFLE(2, 2, 1, 1): \ ret = _mm_shuffle_epi_2211((a)); \ break; \ case _MM_SHUFFLE(0, 1, 2, 2): \ ret = _mm_shuffle_epi_0122((a)); \ break; \ case _MM_SHUFFLE(3, 3, 3, 2): \ ret = _mm_shuffle_epi_3332((a)); \ break; \ case _MM_SHUFFLE(0, 0, 0, 0): \ ret = _mm_shuffle_epi32_splat((a), 0); \ break; \ case _MM_SHUFFLE(1, 1, 1, 1): \ ret = _mm_shuffle_epi32_splat((a), 1); \ break; \ case _MM_SHUFFLE(2, 2, 2, 2): \ ret = _mm_shuffle_epi32_splat((a), 2); \ break; \ case _MM_SHUFFLE(3, 3, 3, 3): \ ret = _mm_shuffle_epi32_splat((a), 3); \ break; \ default: \ ret = _mm_shuffle_epi32_default((a), (imm)); \ break; \ } \ ret; \ }) #endif // Shuffle double-precision (64-bit) floating-point elements using the control // in imm8, and store the results in dst. // // dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] // dst[127:64] := (imm8[1] == 0) ? 
b[63:0] : b[127:64] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pd #if __has_builtin(__builtin_shufflevector) #define _mm_shuffle_pd(a, b, imm8) \ vreinterpretq_m128d_s64(__builtin_shufflevector( \ vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), imm8 & 0x1, \ ((imm8 & 0x2) >> 1) + 2)) #else #define _mm_shuffle_pd(a, b, imm8) \ _mm_castsi128_pd(_mm_set_epi64x( \ vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \ vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1))) #endif // FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a, // __constrange(0,255) int imm) #if __has_builtin(__builtin_shufflevector) #define _mm_shufflehi_epi16(a, imm) \ __extension__({ \ int16x8_t _input = vreinterpretq_s16_m128i(a); \ int16x8_t _shuf = __builtin_shufflevector( \ _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \ (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \ (((imm) >> 6) & 0x3) + 4); \ vreinterpretq_m128i_s16(_shuf); \ }) #else // generic #define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm)) #endif // FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a, // __constrange(0,255) int imm) #if __has_builtin(__builtin_shufflevector) #define _mm_shufflelo_epi16(a, imm) \ __extension__({ \ int16x8_t _input = vreinterpretq_s16_m128i(a); \ int16x8_t _shuf = __builtin_shufflevector( \ _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \ (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \ vreinterpretq_m128i_s16(_shuf); \ }) #else // generic #define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm)) #endif // Shift packed 16-bit integers in a left by count while shifting in zeros, and // store the results in dst. // // FOR j := 0 to 7 // i := j*16 // IF count[63:0] > 15 // dst[i+15:i] := 0 // ELSE // dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi16 FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count) { uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); if (_sse2neon_unlikely(c & ~15)) return _mm_setzero_si128(); int16x8_t vc = vdupq_n_s16((int16_t) c); return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc)); } // Shift packed 32-bit integers in a left by count while shifting in zeros, and // store the results in dst. // // FOR j := 0 to 3 // i := j*32 // IF count[63:0] > 31 // dst[i+31:i] := 0 // ELSE // dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi32 FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count) { uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); if (_sse2neon_unlikely(c & ~31)) return _mm_setzero_si128(); int32x4_t vc = vdupq_n_s32((int32_t) c); return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc)); } // Shift packed 64-bit integers in a left by count while shifting in zeros, and // store the results in dst. 
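/* ---------------------------------------------------------------------
 * Editor's note: illustrative sketch, not part of the upstream sse2neon
 * header. It shows typical use of the _mm_shuffle_epi32 macro defined
 * above together with _MM_SHUFFLE to reverse the four 32-bit lanes of a
 * vector. The helper name sse2neon_example_reverse_lanes is hypothetical.
 */
static inline void sse2neon_example_reverse_lanes(int32_t out[4])
{
    __m128i v = _mm_set_epi32(3, 2, 1, 0); /* lanes 0..3 hold 0,1,2,3 */
    __m128i r = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3));
    vst1q_s32(out, vreinterpretq_s32_m128i(r)); /* out = {3, 2, 1, 0} */
}
/* --------------------------------------------------------------------- */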
// // FOR j := 0 to 1 // i := j*64 // IF count[63:0] > 63 // dst[i+63:i] := 0 // ELSE // dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi64 FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count) { uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); if (_sse2neon_unlikely(c & ~63)) return _mm_setzero_si128(); int64x2_t vc = vdupq_n_s64((int64_t) c); return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc)); } // Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and // store the results in dst. // // FOR j := 0 to 7 // i := j*16 // IF imm8[7:0] > 15 // dst[i+15:i] := 0 // ELSE // dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi16 FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm) { if (_sse2neon_unlikely(imm & ~15)) return _mm_setzero_si128(); return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16((int16_t) imm))); } // Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and // store the results in dst. // // FOR j := 0 to 3 // i := j*32 // IF imm8[7:0] > 31 // dst[i+31:i] := 0 // ELSE // dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi32 FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm) { if (_sse2neon_unlikely(imm & ~31)) return _mm_setzero_si128(); return vreinterpretq_m128i_s32( vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm))); } // Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and // store the results in dst. // // FOR j := 0 to 1 // i := j*64 // IF imm8[7:0] > 63 // dst[i+63:i] := 0 // ELSE // dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi64 FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm) { if (_sse2neon_unlikely(imm & ~63)) return _mm_setzero_si128(); return vreinterpretq_m128i_s64( vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm))); } // Shift a left by imm8 bytes while shifting in zeros, and store the results in // dst. // // tmp := imm8[7:0] // IF tmp > 15 // tmp := 16 // FI // dst[127:0] := a[127:0] << (tmp*8) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_si128 FORCE_INLINE __m128i _mm_slli_si128(__m128i a, int imm) { if (_sse2neon_unlikely(imm & ~15)) return _mm_setzero_si128(); uint8x16_t tmp[2] = {vdupq_n_u8(0), vreinterpretq_u8_m128i(a)}; return vreinterpretq_m128i_u8( vld1q_u8(((uint8_t const *) tmp) + (16 - imm))); } // Compute the square root of packed double-precision (64-bit) floating-point // elements in a, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_pd FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a) { #if defined(__aarch64__) return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a))); #else double a0 = SQRT(((double *) &a)[0]); double a1 = SQRT(((double *) &a)[1]); return _mm_set_pd(a1, a0); #endif } // Compute the square root of the lower double-precision (64-bit) floating-point // element in b, store the result in the lower element of dst, and copy the // upper element from a to the upper element of dst. 
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_sd FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b) { #if defined(__aarch64__) return _mm_move_sd(a, _mm_sqrt_pd(b)); #else return _mm_set_pd(((double *) &a)[1], SQRT(((double *) &b)[0])); #endif } // Shift packed 16-bit integers in a right by count while shifting in sign bits, // and store the results in dst. // // FOR j := 0 to 7 // i := j*16 // IF count[63:0] > 15 // dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) // ELSE // dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sra_epi16 FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count) { int64_t c = (int64_t) vget_low_s64((int64x2_t) count); if (_sse2neon_unlikely(c & ~15)) return _mm_cmplt_epi16(a, _mm_setzero_si128()); return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16((int16_t) (-c)))); } // Shift packed 32-bit integers in a right by count while shifting in sign bits, // and store the results in dst. // // FOR j := 0 to 3 // i := j*32 // IF count[63:0] > 31 // dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) // ELSE // dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sra_epi32 FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count) { int64_t c = (int64_t) vget_low_s64((int64x2_t) count); if (_sse2neon_unlikely(c & ~31)) return _mm_cmplt_epi32(a, _mm_setzero_si128()); return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32((int16_t)(-c)))); } // Shift packed 16-bit integers in a right by imm8 while shifting in sign // bits, and store the results in dst. // // FOR j := 0 to 7 // i := j*16 // IF imm8[7:0] > 15 // dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) // ELSE // dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16 FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm) { const int count = (imm & ~15) ? 15 : imm; return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16((int16_t)(-count))); } // Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, // and store the results in dst. // // FOR j := 0 to 3 // i := j*32 // IF imm8[7:0] > 31 // dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) // ELSE // dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32 // FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm) #define _mm_srai_epi32(a, imm) \ __extension__({ \ __m128i ret; \ if (_sse2neon_unlikely((imm) == 0)) { \ ret = a; \ } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) { \ ret = vreinterpretq_m128i_s32( \ vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-imm))); \ } else { \ ret = vreinterpretq_m128i_s32( \ vshrq_n_s32(vreinterpretq_s32_m128i(a), 31)); \ } \ ret; \ }) // Shift packed 16-bit integers in a right by count while shifting in zeros, and // store the results in dst. 
// // FOR j := 0 to 7 // i := j*16 // IF count[63:0] > 15 // dst[i+15:i] := 0 // ELSE // dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi16 FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count) { uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); if (_sse2neon_unlikely(c & ~15)) return _mm_setzero_si128(); int16x8_t vc = vdupq_n_s16(-(int16_t) c); return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc)); } // Shift packed 32-bit integers in a right by count while shifting in zeros, and // store the results in dst. // // FOR j := 0 to 3 // i := j*32 // IF count[63:0] > 31 // dst[i+31:i] := 0 // ELSE // dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi32 FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count) { uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); if (_sse2neon_unlikely(c & ~31)) return _mm_setzero_si128(); int32x4_t vc = vdupq_n_s32(-(int32_t) c); return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc)); } // Shift packed 64-bit integers in a right by count while shifting in zeros, and // store the results in dst. // // FOR j := 0 to 1 // i := j*64 // IF count[63:0] > 63 // dst[i+63:i] := 0 // ELSE // dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi64 FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count) { uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); if (_sse2neon_unlikely(c & ~63)) return _mm_setzero_si128(); int64x2_t vc = vdupq_n_s64(-(int64_t) c); return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc)); } // Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and // store the results in dst. // // FOR j := 0 to 7 // i := j*16 // IF imm8[7:0] > 15 // dst[i+15:i] := 0 // ELSE // dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16 #define _mm_srli_epi16(a, imm) \ __extension__({ \ __m128i ret; \ if (_sse2neon_unlikely((imm) & ~15)) { \ ret = _mm_setzero_si128(); \ } else { \ ret = vreinterpretq_m128i_u16( \ vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-(imm)))); \ } \ ret; \ }) // Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and // store the results in dst. // // FOR j := 0 to 3 // i := j*32 // IF imm8[7:0] > 31 // dst[i+31:i] := 0 // ELSE // dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32 // FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm) #define _mm_srli_epi32(a, imm) \ __extension__({ \ __m128i ret; \ if (_sse2neon_unlikely((imm) & ~31)) { \ ret = _mm_setzero_si128(); \ } else { \ ret = vreinterpretq_m128i_u32( \ vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-(imm)))); \ } \ ret; \ }) // Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and // store the results in dst. 
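/* ---------------------------------------------------------------------
 * Editor's note: illustrative sketch, not part of the upstream sse2neon
 * header. It contrasts the arithmetic right shift _mm_srai_epi32 (sign
 * bits shifted in) with the logical right shift _mm_srli_epi32 (zeros
 * shifted in), both defined above. The helper name is hypothetical.
 */
static inline void sse2neon_example_shift_right(int32_t out_arith[4],
                                                int32_t out_logic[4])
{
    __m128i v = _mm_set1_epi32(-8); /* 0xFFFFFFF8 in every lane */
    /* arithmetic: -8 >> 2 == -2 in every lane */
    vst1q_s32(out_arith, vreinterpretq_s32_m128i(_mm_srai_epi32(v, 2)));
    /* logical: 0xFFFFFFF8 >> 2 == 0x3FFFFFFE in every lane */
    vst1q_s32(out_logic, vreinterpretq_s32_m128i(_mm_srli_epi32(v, 2)));
}
/* --------------------------------------------------------------------- */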
// // FOR j := 0 to 1 // i := j*64 // IF imm8[7:0] > 63 // dst[i+63:i] := 0 // ELSE // dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64 #define _mm_srli_epi64(a, imm) \ __extension__({ \ __m128i ret; \ if (_sse2neon_unlikely((imm) & ~63)) { \ ret = _mm_setzero_si128(); \ } else { \ ret = vreinterpretq_m128i_u64( \ vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-(imm)))); \ } \ ret; \ }) // Shift a right by imm8 bytes while shifting in zeros, and store the results in // dst. // // tmp := imm8[7:0] // IF tmp > 15 // tmp := 16 // FI // dst[127:0] := a[127:0] >> (tmp*8) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_si128 FORCE_INLINE __m128i _mm_srli_si128(__m128i a, int imm) { if (_sse2neon_unlikely(imm & ~15)) return _mm_setzero_si128(); uint8x16_t tmp[2] = {vreinterpretq_u8_m128i(a), vdupq_n_u8(0)}; return vreinterpretq_m128i_u8(vld1q_u8(((uint8_t const *) tmp) + imm)); } // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point // elements) from a into memory. mem_addr must be aligned on a 16-byte boundary // or a general-protection exception may be generated. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a) { #if defined(__aarch64__) vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a)); #else vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a)); #endif } // Store the lower double-precision (64-bit) floating-point element from a into // 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte // boundary or a general-protection exception may be generated. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd1 FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a) { #if defined(__aarch64__) float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a)); vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low))); #else float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a)); vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low))); #endif } // Store the lower double-precision (64-bit) floating-point element from a into // memory. mem_addr does not need to be aligned on any particular boundary. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_store_sd FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a) { #if defined(__aarch64__) vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a))); #else vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a))); #endif } // Stores four 32-bit integer values as (as a __m128i value) at the address p. // https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a) { vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); } // Store the lower double-precision (64-bit) floating-point element from a into // 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte // boundary or a general-protection exception may be generated. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=9,526,5601&text=_mm_store1_pd #define _mm_store1_pd _mm_store_pd1 // Store the upper double-precision (64-bit) floating-point element from a into // memory. 
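//
// Usage sketch (illustrative, arbitrary values): _mm_storeh_pd pairs with
// _mm_storel_pd to extract both halves of a __m128d into plain doubles.
//
//   __m128d v = _mm_set_pd(3.5, 1.25);   // low lane = 1.25, high lane = 3.5
//   double lo, hi;
//   _mm_storel_pd(&lo, v);               // lo = 1.25
//   _mm_storeh_pd(&hi, v);               // hi = 3.5
//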
// // MEM[mem_addr+63:mem_addr] := a[127:64] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeh_pd FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a) { #if defined(__aarch64__) vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a))); #else vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a))); #endif } // Reads the lower 64 bits of b and stores them into the lower 64 bits of a. // https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b) { uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a)); uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b)); *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi)); } // Store the lower double-precision (64-bit) floating-point element from a into // memory. // // MEM[mem_addr+63:mem_addr] := a[63:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_pd FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a) { #if defined(__aarch64__) vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a))); #else vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a))); #endif } // Store 2 double-precision (64-bit) floating-point elements from a into memory // in reverse order. mem_addr must be aligned on a 16-byte boundary or a // general-protection exception may be generated. // // MEM[mem_addr+63:mem_addr] := a[127:64] // MEM[mem_addr+127:mem_addr+64] := a[63:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_pd FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a) { float32x4_t f = vreinterpretq_f32_m128d(a); _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2))); } // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point // elements) from a into memory. mem_addr does not need to be aligned on any // particular boundary. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a) { _mm_store_pd(mem_addr, a); } // Stores 128-bits of integer data a at the address p. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si128 FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a) { vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); } // Stores 32-bits of integer data a at the address p. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si32 FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a) { vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0); } // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point // elements) from a into memory using a non-temporal memory hint. mem_addr must // be aligned on a 16-byte boundary or a general-protection exception may be // generated. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pd FORCE_INLINE void _mm_stream_pd(double *p, __m128d a) { #if __has_builtin(__builtin_nontemporal_store) __builtin_nontemporal_store(a, (float32x4_t *) p); #elif defined(__aarch64__) vst1q_f64(p, vreinterpretq_f64_m128d(a)); #else vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a)); #endif } // Stores the data in a to the address p without polluting the caches. If the // cache line containing address p is already in the cache, the cache will be // updated. 
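//
// Usage sketch (illustrative; 'dst', 'n' and 'value' are placeholder names):
// streaming stores are typically used to fill a large, 16-byte aligned buffer
// without displacing useful cache lines, followed by a store fence before the
// data is consumed elsewhere.
//
//   void fill_nt(int *dst, int n, int value) {   // n assumed multiple of 4
//       __m128i v = _mm_set1_epi32(value);
//       for (int i = 0; i < n; i += 4)
//           _mm_stream_si128((__m128i *) (dst + i), v);
//       _mm_sfence();
//   }
//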
// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a) { #if __has_builtin(__builtin_nontemporal_store) __builtin_nontemporal_store(a, p); #else vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a)); #endif } // Store 32-bit integer a into memory using a non-temporal hint to minimize // cache pollution. If the cache line containing address mem_addr is already in // the cache, the cache will be updated. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si32 FORCE_INLINE void _mm_stream_si32(int *p, int a) { vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0); } // Store 64-bit integer a into memory using a non-temporal hint to minimize // cache pollution. If the cache line containing address mem_addr is already in // the cache, the cache will be updated. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si64 FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a) { vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a)); } // Subtract packed 16-bit integers in b from packed 16-bit integers in a, and // store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi16 FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_s16( vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or // unsigned 32-bit integers of a. // // r0 := a0 - b0 // r1 := a1 - b1 // r2 := a2 - b2 // r3 := a3 - b3 // // https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a, // and store the results in dst. // r0 := a0 - b0 // r1 := a1 - b1 FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b) { return vreinterpretq_m128i_s64( vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); } // Subtract packed 8-bit integers in b from packed 8-bit integers in a, and // store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi8 FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_s8( vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } // Subtract packed double-precision (64-bit) floating-point elements in b from // packed double-precision (64-bit) floating-point elements in a, and store the // results in dst. // // FOR j := 0 to 1 // i := j*64 // dst[i+63:i] := a[i+63:i] - b[i+63:i] // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_pd FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else double *da = (double *) &a; double *db = (double *) &b; double c[2]; c[0] = da[0] - db[0]; c[1] = da[1] - db[1]; return vld1q_f32((float32_t *) c); #endif } // Subtract the lower double-precision (64-bit) floating-point element in b from // the lower double-precision (64-bit) floating-point element in a, store the // result in the lower element of dst, and copy the upper element from a to the // upper element of dst. 
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sd FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b) { return _mm_move_sd(a, _mm_sub_pd(a, b)); } // Subtract 64-bit integer b from 64-bit integer a, and store the result in dst. // // dst[63:0] := a[63:0] - b[63:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64 FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b) { return vreinterpret_m64_s64( vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b))); } // Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers // of a and saturates. // // r0 := SignedSaturate(a0 - b0) // r1 := SignedSaturate(a1 - b1) // ... // r7 := SignedSaturate(a7 - b7) // // https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90) FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_s16( vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers // of a and saturates. // // r0 := SignedSaturate(a0 - b0) // r1 := SignedSaturate(a1 - b1) // ... // r15 := SignedSaturate(a15 - b15) // // https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90) FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_s8( vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } // Subtracts the 8 unsigned 16-bit integers of bfrom the 8 unsigned 16-bit // integers of a and saturates.. // https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b) { return vreinterpretq_m128i_u16( vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); } // Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit // integers of a and saturates. // // r0 := UnsignedSaturate(a0 - b0) // r1 := UnsignedSaturate(a1 - b1) // ... // r15 := UnsignedSaturate(a15 - b15) // // https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90) FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); } #define _mm_ucomieq_sd _mm_comieq_sd #define _mm_ucomige_sd _mm_comige_sd #define _mm_ucomigt_sd _mm_comigt_sd #define _mm_ucomile_sd _mm_comile_sd #define _mm_ucomilt_sd _mm_comilt_sd #define _mm_ucomineq_sd _mm_comineq_sd // Return vector of type __m128d with undefined elements. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_pd FORCE_INLINE __m128d _mm_undefined_pd(void) { #if defined(__GNUC__) || defined(__clang__) //#pragma GCC diagnostic push //#pragma GCC diagnostic ignored "-Wuninitialized" #endif // __m128d a; __m128d a= (__m128d) ZERO_SSE2NEON(); BUG_SSE2NEON; return a; #if defined(__GNUC__) || defined(__clang__) //#pragma GCC diagnostic pop #endif } // Interleaves the upper 4 signed or unsigned 16-bit integers in a with the // upper 4 signed or unsigned 16-bit integers in b. 
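//
// Usage sketch (illustrative): besides plain interleaving, unpacking against a
// zero vector is the classic pre-SSE4.1 idiom for widening the upper four
// unsigned 16-bit lanes of 'a' to 32 bits.
//
//   __m128i hi32 = _mm_unpackhi_epi16(a, _mm_setzero_si128());
//   // 32-bit lane j of hi32 now holds the zero-extended 16-bit lane (4 + j) of a
//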
// // r0 := a4 // r1 := b4 // r2 := a5 // r3 := b5 // r4 := a6 // r5 := b6 // r6 := a7 // r7 := b7 // // https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b) { #if defined(__aarch64__) return vreinterpretq_m128i_s16( vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); #else int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a)); int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b)); int16x4x2_t result = vzip_s16(a1, b1); return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1])); #endif } // Interleaves the upper 2 signed or unsigned 32-bit integers in a with the // upper 2 signed or unsigned 32-bit integers in b. // https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) { #if defined(__aarch64__) return vreinterpretq_m128i_s32( vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); #else int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a)); int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b)); int32x2x2_t result = vzip_s32(a1, b1); return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); #endif } // Interleaves the upper signed or unsigned 64-bit integer in a with the // upper signed or unsigned 64-bit integer in b. // // r0 := a1 // r1 := b1 FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) { int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a)); int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b)); return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h)); } // Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper // 8 signed or unsigned 8-bit integers in b. // // r0 := a8 // r1 := b8 // r2 := a9 // r3 := b9 // ... // r14 := a15 // r15 := b15 // // https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b) { #if defined(__aarch64__) return vreinterpretq_m128i_s8( vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); #else int8x8_t a1 = vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a))); int8x8_t b1 = vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b))); int8x8x2_t result = vzip_s8(a1, b1); return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1])); #endif } // Unpack and interleave double-precision (64-bit) floating-point elements from // the high half of a and b, and store the results in dst. // // DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { // dst[63:0] := src1[127:64] // dst[127:64] := src2[127:64] // RETURN dst[127:0] // } // dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_pd FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else return vreinterpretq_m128d_s64( vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)), vget_high_s64(vreinterpretq_s64_m128d(b)))); #endif } // Interleaves the lower 4 signed or unsigned 16-bit integers in a with the // lower 4 signed or unsigned 16-bit integers in b. 
// // r0 := a0 // r1 := b0 // r2 := a1 // r3 := b1 // r4 := a2 // r5 := b2 // r6 := a3 // r7 := b3 // // https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b) { #if defined(__aarch64__) return vreinterpretq_m128i_s16( vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); #else int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a)); int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b)); int16x4x2_t result = vzip_s16(a1, b1); return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1])); #endif } // Interleaves the lower 2 signed or unsigned 32 - bit integers in a with the // lower 2 signed or unsigned 32 - bit integers in b. // // r0 := a0 // r1 := b0 // r2 := a1 // r3 := b1 // // https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) { #if defined(__aarch64__) return vreinterpretq_m128i_s32( vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); #else int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a)); int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b)); int32x2x2_t result = vzip_s32(a1, b1); return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); #endif } FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) { int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a)); int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b)); return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l)); } // Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower // 8 signed or unsigned 8-bit integers in b. // // r0 := a0 // r1 := b0 // r2 := a1 // r3 := b1 // ... // r14 := a7 // r15 := b7 // // https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b) { #if defined(__aarch64__) return vreinterpretq_m128i_s8( vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); #else int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a))); int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b))); int8x8x2_t result = vzip_s8(a1, b1); return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1])); #endif } // Unpack and interleave double-precision (64-bit) floating-point elements from // the low half of a and b, and store the results in dst. // // DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { // dst[63:0] := src1[63:0] // dst[127:64] := src2[63:0] // RETURN dst[127:0] // } // dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_pd FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else return vreinterpretq_m128d_s64( vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)), vget_low_s64(vreinterpretq_s64_m128d(b)))); #endif } // Compute the bitwise XOR of packed double-precision (64-bit) floating-point // elements in a and b, and store the results in dst. 
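//
// Usage sketch (illustrative): XOR-ing with the sign-bit mask is the usual
// branch-free way to negate both double lanes at once (and _mm_andnot_pd with
// the same mask gives the absolute value).
//
//   __m128d neg = _mm_xor_pd(v, _mm_set1_pd(-0.0));   // {-v0, -v1}
//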
// // FOR j := 0 to 1 // i := j*64 // dst[i+63:i] := a[i+63:i] XOR b[i+63:i] // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b) { return vreinterpretq_m128d_s64( veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); } // Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in // b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } /* SSE3 */ // Alternatively add and subtract packed double-precision (64-bit) // floating-point elements in a to/from packed elements in b, and store the // results in dst. // // FOR j := 0 to 1 // i := j*64 // IF ((j & 1) == 0) // dst[i+63:i] := a[i+63:i] - b[i+63:i] // ELSE // dst[i+63:i] := a[i+63:i] + b[i+63:i] // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_addsub_pd FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b) { static const __m128d mask = _mm_set_pd(1.0f, -1.0f); #if defined(__aarch64__) return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(mask))); #else return _mm_add_pd(_mm_mul_pd(b, mask), a); #endif } // Alternatively add and subtract packed single-precision (32-bit) // floating-point elements in a to/from packed elements in b, and store the // results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b) { static const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f); #if defined(__aarch64__) || defined(__ARM_FEATURE_FMA) /* VFPv4+ */ return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(mask), vreinterpretq_f32_m128(b))); #else return _mm_add_ps(_mm_mul_ps(b, mask), a); #endif } // Horizontally add adjacent pairs of double-precision (64-bit) floating-point // elements in a and b, and pack the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pd FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else double *da = (double *) &a; double *db = (double *) &b; double c[] = {da[0] + da[1], db[0] + db[1]}; return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c)); #endif } // Computes pairwise add of each argument as single-precision, floating-point // values a and b. // https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b) { #if defined(__aarch64__) return vreinterpretq_m128_f32( vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); #else float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); return vreinterpretq_m128_f32( vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32))); #endif } // Horizontally subtract adjacent pairs of double-precision (64-bit) // floating-point elements in a and b, and pack the results in dst. 
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pd FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b) { #if defined(__aarch64__) float64x2_t a = vreinterpretq_f64_m128d(_a); float64x2_t b = vreinterpretq_f64_m128d(_b); return vreinterpretq_m128d_f64( vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b))); #else double *da = (double *) &_a; double *db = (double *) &_b; double c[] = {da[0] - da[1], db[0] - db[1]}; return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c)); #endif } // Horizontally subtract adjacent pairs of single-precision (32-bit) // floating-point elements in a and b, and pack the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b) { float32x4_t a = vreinterpretq_f32_m128(_a); float32x4_t b = vreinterpretq_f32_m128(_b); #if defined(__aarch64__) return vreinterpretq_m128_f32( vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b))); #else float32x4x2_t c = vuzpq_f32(a, b); return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1])); #endif } // Load 128-bits of integer data from unaligned memory into dst. This intrinsic // may perform better than _mm_loadu_si128 when the data crosses a cache line // boundary. // // dst[127:0] := MEM[mem_addr+127:mem_addr] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128 #define _mm_lddqu_si128 _mm_loadu_si128 // Load a double-precision (64-bit) floating-point element from memory into both // elements of dst. // // dst[63:0] := MEM[mem_addr+63:mem_addr] // dst[127:64] := MEM[mem_addr+63:mem_addr] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd #define _mm_loaddup_pd _mm_load1_pd // Duplicate the low double-precision (64-bit) floating-point element from a, // and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movedup_pd FORCE_INLINE __m128d _mm_movedup_pd(__m128d a) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0)); #else return vreinterpretq_m128d_u64( vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0))); #endif } // Duplicate odd-indexed single-precision (32-bit) floating-point elements // from a, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a) { #if __has_builtin(__builtin_shufflevector) return vreinterpretq_m128_f32(__builtin_shufflevector( vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3)); #else float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1); float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3); float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3}; return vreinterpretq_m128_f32(vld1q_f32(data)); #endif } // Duplicate even-indexed single-precision (32-bit) floating-point elements // from a, and store the results in dst. 
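//
// Usage sketch (illustrative; 'a' and 'b' are placeholder operands holding
// interleaved complex pairs {re0, im0, re1, im1}): _mm_moveldup_ps and
// _mm_movehdup_ps together with _mm_addsub_ps form the standard SSE3 complex
// multiplication sequence.
//
//   __m128 re   = _mm_mul_ps(_mm_moveldup_ps(a), b);
//   __m128 bsw  = _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 0, 1));
//   __m128 im   = _mm_mul_ps(_mm_movehdup_ps(a), bsw);
//   __m128 prod = _mm_addsub_ps(re, im);   // {ar*br - ai*bi, ar*bi + ai*br, ...}
//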
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a) { #if __has_builtin(__builtin_shufflevector) return vreinterpretq_m128_f32(__builtin_shufflevector( vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2)); #else float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2); float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2}; return vreinterpretq_m128_f32(vld1q_f32(data)); #endif } /* SSSE3 */ // Compute the absolute value of packed signed 16-bit integers in a, and store // the unsigned results in dst. // // FOR j := 0 to 7 // i := j*16 // dst[i+15:i] := ABS(a[i+15:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16 FORCE_INLINE __m128i _mm_abs_epi16(__m128i a) { return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a))); } // Compute the absolute value of packed signed 32-bit integers in a, and store // the unsigned results in dst. // // FOR j := 0 to 3 // i := j*32 // dst[i+31:i] := ABS(a[i+31:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32 FORCE_INLINE __m128i _mm_abs_epi32(__m128i a) { return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a))); } // Compute the absolute value of packed signed 8-bit integers in a, and store // the unsigned results in dst. // // FOR j := 0 to 15 // i := j*8 // dst[i+7:i] := ABS(a[i+7:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8 FORCE_INLINE __m128i _mm_abs_epi8(__m128i a) { return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a))); } // Compute the absolute value of packed signed 16-bit integers in a, and store // the unsigned results in dst. // // FOR j := 0 to 3 // i := j*16 // dst[i+15:i] := ABS(a[i+15:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16 FORCE_INLINE __m64 _mm_abs_pi16(__m64 a) { return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a))); } // Compute the absolute value of packed signed 32-bit integers in a, and store // the unsigned results in dst. // // FOR j := 0 to 1 // i := j*32 // dst[i+31:i] := ABS(a[i+31:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32 FORCE_INLINE __m64 _mm_abs_pi32(__m64 a) { return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a))); } // Compute the absolute value of packed signed 8-bit integers in a, and store // the unsigned results in dst. // // FOR j := 0 to 7 // i := j*8 // dst[i+7:i] := ABS(a[i+7:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8 FORCE_INLINE __m64 _mm_abs_pi8(__m64 a) { return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a))); } // Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift // the result right by imm8 bytes, and store the low 16 bytes in dst. 
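//
// Usage sketch (illustrative, arbitrary byte values): with b = {0,1,...,15}
// and a = {16,...,31}, the call below yields {4,5,...,15,16,17,18,19}, i.e. a
// 16-byte window starting 4 bytes into the concatenation a:b -- handy for
// sliding-window loops over two adjacent loads.
//
//   __m128i window = _mm_alignr_epi8(a, b, 4);
//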
// // tmp[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8) // dst[127:0] := tmp[127:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi8 FORCE_INLINE __m128i _mm_alignr_epi8(__m128i a, __m128i b, int imm) { if (_sse2neon_unlikely(imm & ~31)) return _mm_setzero_si128(); int idx; uint8x16_t tmp[2]; if (imm >= 16) { idx = imm - 16; tmp[0] = vreinterpretq_u8_m128i(a); tmp[1] = vdupq_n_u8(0); } else { idx = imm; tmp[0] = vreinterpretq_u8_m128i(b); tmp[1] = vreinterpretq_u8_m128i(a); } return vreinterpretq_m128i_u8(vld1q_u8(((uint8_t const *) tmp) + idx)); } // Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift // the result right by imm8 bytes, and store the low 8 bytes in dst. // // tmp[127:0] := ((a[63:0] << 64)[127:0] OR b[63:0]) >> (imm8*8) // dst[63:0] := tmp[63:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_pi8 #define _mm_alignr_pi8(a, b, imm) \ __extension__({ \ __m64 ret; \ if (_sse2neon_unlikely((imm) >= 16)) { \ ret = vreinterpret_m64_s8(vdup_n_s8(0)); \ } else { \ uint8x8_t tmp_low, tmp_high; \ if ((imm) >= 8) { \ const int idx = (imm) -8; \ tmp_low = vreinterpret_u8_m64(a); \ tmp_high = vdup_n_u8(0); \ ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ } else { \ const int idx = (imm); \ tmp_low = vreinterpret_u8_m64(b); \ tmp_high = vreinterpret_u8_m64(a); \ ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ } \ } \ ret; \ }) // Computes pairwise add of each argument as a 16-bit signed or unsigned integer // values a and b. FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b) { int16x8_t a = vreinterpretq_s16_m128i(_a); int16x8_t b = vreinterpretq_s16_m128i(_b); #if defined(__aarch64__) return vreinterpretq_m128i_s16(vpaddq_s16(a, b)); #else return vreinterpretq_m128i_s16( vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)), vpadd_s16(vget_low_s16(b), vget_high_s16(b)))); #endif } // Computes pairwise add of each argument as a 32-bit signed or unsigned integer // values a and b. FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b) { int32x4_t a = vreinterpretq_s32_m128i(_a); int32x4_t b = vreinterpretq_s32_m128i(_b); return vreinterpretq_m128i_s32( vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)), vpadd_s32(vget_low_s32(b), vget_high_s32(b)))); } // Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the // signed 16-bit results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16 FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b) { return vreinterpret_m64_s16( vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); } // Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the // signed 32-bit results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32 FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b) { return vreinterpret_m64_s32( vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))); } // Computes saturated pairwise sub of each argument as a 16-bit signed // integer values a and b. 
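//
// Illustrative semantics sketch (assuming the saturating horizontal-add
// behaviour implemented below with vqaddq_s16):
//   dst = { sat16(a0+a1), sat16(a2+a3), sat16(a4+a5), sat16(a6+a7),
//           sat16(b0+b1), sat16(b2+b3), sat16(b4+b5), sat16(b6+b7) }
// e.g. with every lane of a equal to 0x7FFF, each pair sums to 0xFFFE and
// saturates back to 0x7FFF:
//
//   __m128i r = _mm_hadds_epi16(_mm_set1_epi16(0x7FFF), b);
//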
FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b) { #if defined(__aarch64__) int16x8_t a = vreinterpretq_s16_m128i(_a); int16x8_t b = vreinterpretq_s16_m128i(_b); return vreinterpretq_s64_s16( vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); #else int32x4_t a = vreinterpretq_s32_m128i(_a); int32x4_t b = vreinterpretq_s32_m128i(_b); // Interleave using vshrn/vmovn // [a0|a2|a4|a6|b0|b2|b4|b6] // [a1|a3|a5|a7|b1|b3|b5|b7] int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b)); int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16)); // Saturated add return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357)); #endif } // Horizontally add adjacent pairs of signed 16-bit integers in a and b using // saturation, and pack the signed 16-bit results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadds_pi16 FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b) { int16x4_t a = vreinterpret_s16_m64(_a); int16x4_t b = vreinterpret_s16_m64(_b); #if defined(__aarch64__) return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b))); #else int16x4x2_t res = vuzp_s16(a, b); return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1])); #endif } // Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack // the signed 16-bit results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_epi16 FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b) { int16x8_t a = vreinterpretq_s16_m128i(_a); int16x8_t b = vreinterpretq_s16_m128i(_b); #if defined(__aarch64__) return vreinterpretq_m128i_s16( vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); #else int16x8x2_t c = vuzpq_s16(a, b); return vreinterpretq_m128i_s16(vsubq_s16(c.val[0], c.val[1])); #endif } // Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack // the signed 32-bit results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_epi32 FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b) { int32x4_t a = vreinterpretq_s32_m128i(_a); int32x4_t b = vreinterpretq_s32_m128i(_b); #if defined(__aarch64__) return vreinterpretq_m128i_s32( vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b))); #else int32x4x2_t c = vuzpq_s32(a, b); return vreinterpretq_m128i_s32(vsubq_s32(c.val[0], c.val[1])); #endif } // Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack // the signed 16-bit results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pi16 FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b) { int16x4_t a = vreinterpret_s16_m64(_a); int16x4_t b = vreinterpret_s16_m64(_b); #if defined(__aarch64__) return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b))); #else int16x4x2_t c = vuzp_s16(a, b); return vreinterpret_m64_s16(vsub_s16(c.val[0], c.val[1])); #endif } // Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack // the signed 32-bit results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_hsub_pi32 FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b) { int32x2_t a = vreinterpret_s32_m64(_a); int32x2_t b = vreinterpret_s32_m64(_b); #if defined(__aarch64__) return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b))); #else int32x2x2_t c = vuzp_s32(a, b); return vreinterpret_m64_s32(vsub_s32(c.val[0], c.val[1])); #endif } // Computes saturated pairwise difference of each argument as a 16-bit signed // integer values a and b. 
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16 FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b) { int16x8_t a = vreinterpretq_s16_m128i(_a); int16x8_t b = vreinterpretq_s16_m128i(_b); #if defined(__aarch64__) return vreinterpretq_m128i_s16( vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); #else int16x8x2_t c = vuzpq_s16(a, b); return vreinterpretq_m128i_s16(vqsubq_s16(c.val[0], c.val[1])); #endif } // Horizontally subtract adjacent pairs of signed 16-bit integers in a and b // using saturation, and pack the signed 16-bit results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_pi16 FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b) { int16x4_t a = vreinterpret_s16_m64(_a); int16x4_t b = vreinterpret_s16_m64(_b); #if defined(__aarch64__) return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b))); #else int16x4x2_t c = vuzp_s16(a, b); return vreinterpret_m64_s16(vqsub_s16(c.val[0], c.val[1])); #endif } // Vertically multiply each unsigned 8-bit integer from a with the corresponding // signed 8-bit integer from b, producing intermediate signed 16-bit integers. // Horizontally add adjacent pairs of intermediate signed 16-bit integers, // and pack the saturated results in dst. // // FOR j := 0 to 7 // i := j*16 // dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + // a[i+7:i]*b[i+7:i] ) // ENDFOR FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b) { #if defined(__aarch64__) uint8x16_t a = vreinterpretq_u8_m128i(_a); int8x16_t b = vreinterpretq_s8_m128i(_b); int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))), vmovl_s8(vget_low_s8(b))); int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))), vmovl_s8(vget_high_s8(b))); return vreinterpretq_m128i_s16( vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th))); #else // This would be much simpler if x86 would choose to zero extend OR sign // extend, not both. This could probably be optimized better. uint16x8_t a = vreinterpretq_u16_m128i(_a); int16x8_t b = vreinterpretq_s16_m128i(_b); // Zero extend a int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8)); int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00))); // Sign extend by shifting left then shifting right. int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8); int16x8_t b_odd = vshrq_n_s16(b, 8); // multiply int16x8_t prod1 = vmulq_s16(a_even, b_even); int16x8_t prod2 = vmulq_s16(a_odd, b_odd); // saturated add return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2)); #endif } // Vertically multiply each unsigned 8-bit integer from a with the corresponding // signed 8-bit integer from b, producing intermediate signed 16-bit integers. // Horizontally add adjacent pairs of intermediate signed 16-bit integers, and // pack the saturated results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maddubs_pi16 FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b) { uint16x4_t a = vreinterpret_u16_m64(_a); int16x4_t b = vreinterpret_s16_m64(_b); // Zero extend a int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8)); int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff))); // Sign extend by shifting left then shifting right. 
int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8); int16x4_t b_odd = vshr_n_s16(b, 8); // multiply int16x4_t prod1 = vmul_s16(a_even, b_even); int16x4_t prod2 = vmul_s16(a_odd, b_odd); // saturated add return vreinterpret_m64_s16(vqadd_s16(prod1, prod2)); } // Multiply packed signed 16-bit integers in a and b, producing intermediate // signed 32-bit integers. Shift right by 15 bits while rounding up, and store // the packed 16-bit integers in dst. // // r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15) // r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15) // r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15) // ... // r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15) FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b) { // Has issues due to saturation // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b)); // Multiply int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)), vget_low_s16(vreinterpretq_s16_m128i(b))); int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)), vget_high_s16(vreinterpretq_s16_m128i(b))); // Rounding narrowing shift right // narrow = (int16_t)((mul + 16384) >> 15); int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15); int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15); // Join together return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi)); } // Multiply packed signed 16-bit integers in a and b, producing intermediate // signed 32-bit integers. Truncate each intermediate integer to the 18 most // significant bits, round by adding 1, and store bits [16:1] to dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhrs_pi16 FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b) { int32x4_t mul_extend = vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b))); // Rounding narrowing shift right return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15)); } // Shuffle packed 8-bit integers in a according to shuffle control mask in the // corresponding 8-bit element of b, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8 FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b) { int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b uint8x16_t idx_masked = vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits #if defined(__aarch64__) return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked)); #elif defined(__GNUC__) int8x16_t ret; // %e and %f represent the even and odd D registers // respectively. __asm__ __volatile__( "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n" "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n" : [ret] "=&w"(ret) : [tbl] "w"(tbl), [idx] "w"(idx_masked)); return vreinterpretq_m128i_s8(ret); #else // use this line if testing on aarch64 int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)}; return vreinterpretq_m128i_s8( vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)), vtbl2_s8(a_split, vget_high_u8(idx_masked)))); #endif } // Shuffle packed 8-bit integers in a according to shuffle control mask in the // corresponding 8-bit element of b, and store the results in dst. 
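//
// Usage sketch (illustrative) for the byte shuffles defined here: each result
// byte is a table lookup into the first operand, and a control byte with its
// top bit set yields zero. For example, reversing the 16 bytes of a
// placeholder vector 'x' with the 128-bit variant:
//
//   const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
//                                    8, 9, 10, 11, 12, 13, 14, 15);
//   __m128i reversed = _mm_shuffle_epi8(x, rev);
//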
// // FOR j := 0 to 7 // i := j*8 // IF b[i+7] == 1 // dst[i+7:i] := 0 // ELSE // index[2:0] := b[i+2:i] // dst[i+7:i] := a[index*8+7:index*8] // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pi8 FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b) { const int8x8_t controlMask = vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t)(0x1 << 7 | 0x07))); int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask); return vreinterpret_m64_s8(res); } // Negate packed 16-bit integers in a when the corresponding signed // 16-bit integer in b is negative, and store the results in dst. // Element in dst are zeroed out when the corresponding element // in b is zero. // // for i in 0..7 // if b[i] < 0 // r[i] := -a[i] // else if b[i] == 0 // r[i] := 0 // else // r[i] := a[i] // fi // done FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b) { int16x8_t a = vreinterpretq_s16_m128i(_a); int16x8_t b = vreinterpretq_s16_m128i(_b); // signed shift right: faster than vclt // (b < 0) ? 0xFFFF : 0 uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15)); // (b == 0) ? 0xFFFF : 0 #if defined(__aarch64__) int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b)); #else int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0))); #endif // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative // 'a') based on ltMask int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a); // res = masked & (~zeroMask) int16x8_t res = vbicq_s16(masked, zeroMask); return vreinterpretq_m128i_s16(res); } // Negate packed 32-bit integers in a when the corresponding signed // 32-bit integer in b is negative, and store the results in dst. // Element in dst are zeroed out when the corresponding element // in b is zero. // // for i in 0..3 // if b[i] < 0 // r[i] := -a[i] // else if b[i] == 0 // r[i] := 0 // else // r[i] := a[i] // fi // done FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b) { int32x4_t a = vreinterpretq_s32_m128i(_a); int32x4_t b = vreinterpretq_s32_m128i(_b); // signed shift right: faster than vclt // (b < 0) ? 0xFFFFFFFF : 0 uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31)); // (b == 0) ? 0xFFFFFFFF : 0 #if defined(__aarch64__) int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b)); #else int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0))); #endif // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative // 'a') based on ltMask int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a); // res = masked & (~zeroMask) int32x4_t res = vbicq_s32(masked, zeroMask); return vreinterpretq_m128i_s32(res); } // Negate packed 8-bit integers in a when the corresponding signed // 8-bit integer in b is negative, and store the results in dst. // Element in dst are zeroed out when the corresponding element // in b is zero. // // for i in 0..15 // if b[i] < 0 // r[i] := -a[i] // else if b[i] == 0 // r[i] := 0 // else // r[i] := a[i] // fi // done FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b) { int8x16_t a = vreinterpretq_s8_m128i(_a); int8x16_t b = vreinterpretq_s8_m128i(_b); // signed shift right: faster than vclt // (b < 0) ? 0xFF : 0 uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7)); // (b == 0) ? 
0xFF : 0 #if defined(__aarch64__) int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b)); #else int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0))); #endif // bitwise select either a or negative 'a' (vnegq_s8(a) return negative 'a') // based on ltMask int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a); // res = masked & (~zeroMask) int8x16_t res = vbicq_s8(masked, zeroMask); return vreinterpretq_m128i_s8(res); } // Negate packed 16-bit integers in a when the corresponding signed 16-bit // integer in b is negative, and store the results in dst. Element in dst are // zeroed out when the corresponding element in b is zero. // // FOR j := 0 to 3 // i := j*16 // IF b[i+15:i] < 0 // dst[i+15:i] := -(a[i+15:i]) // ELSE IF b[i+15:i] == 0 // dst[i+15:i] := 0 // ELSE // dst[i+15:i] := a[i+15:i] // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16 FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b) { int16x4_t a = vreinterpret_s16_m64(_a); int16x4_t b = vreinterpret_s16_m64(_b); // signed shift right: faster than vclt // (b < 0) ? 0xFFFF : 0 uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15)); // (b == 0) ? 0xFFFF : 0 #if defined(__aarch64__) int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b)); #else int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0))); #endif // bitwise select either a or negative 'a' (vneg_s16(a) return negative 'a') // based on ltMask int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a); // res = masked & (~zeroMask) int16x4_t res = vbic_s16(masked, zeroMask); return vreinterpret_m64_s16(res); } // Negate packed 32-bit integers in a when the corresponding signed 32-bit // integer in b is negative, and store the results in dst. Element in dst are // zeroed out when the corresponding element in b is zero. // // FOR j := 0 to 1 // i := j*32 // IF b[i+31:i] < 0 // dst[i+31:i] := -(a[i+31:i]) // ELSE IF b[i+31:i] == 0 // dst[i+31:i] := 0 // ELSE // dst[i+31:i] := a[i+31:i] // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32 FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b) { int32x2_t a = vreinterpret_s32_m64(_a); int32x2_t b = vreinterpret_s32_m64(_b); // signed shift right: faster than vclt // (b < 0) ? 0xFFFFFFFF : 0 uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31)); // (b == 0) ? 0xFFFFFFFF : 0 #if defined(__aarch64__) int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b)); #else int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0))); #endif // bitwise select either a or negative 'a' (vneg_s32(a) return negative 'a') // based on ltMask int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a); // res = masked & (~zeroMask) int32x2_t res = vbic_s32(masked, zeroMask); return vreinterpret_m64_s32(res); } // Negate packed 8-bit integers in a when the corresponding signed 8-bit integer // in b is negative, and store the results in dst. Element in dst are zeroed out // when the corresponding element in b is zero. // // FOR j := 0 to 7 // i := j*8 // IF b[i+7:i] < 0 // dst[i+7:i] := -(a[i+7:i]) // ELSE IF b[i+7:i] == 0 // dst[i+7:i] := 0 // ELSE // dst[i+7:i] := a[i+7:i] // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8 FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b) { int8x8_t a = vreinterpret_s8_m64(_a); int8x8_t b = vreinterpret_s8_m64(_b); // signed shift right: faster than vclt // (b < 0) ? 
0xFF : 0 uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7)); // (b == 0) ? 0xFF : 0 #if defined(__aarch64__) int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b)); #else int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0))); #endif // bitwise select either a or negative 'a' (vneg_s8(a) return negative 'a') // based on ltMask int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a); // res = masked & (~zeroMask) int8x8_t res = vbic_s8(masked, zeroMask); return vreinterpret_m64_s8(res); } /* SSE4.1 */ // Blend packed 16-bit integers from a and b using control mask imm8, and store // the results in dst. // // FOR j := 0 to 7 // i := j*16 // IF imm8[j] // dst[i+15:i] := b[i+15:i] // ELSE // dst[i+15:i] := a[i+15:i] // FI // ENDFOR // FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b, // __constrange(0,255) int imm) #define _mm_blend_epi16(a, b, imm) \ __extension__({ \ const uint16_t _mask[8] = {((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0, \ ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0, \ ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0, \ ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0, \ ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0, \ ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0, \ ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0, \ ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0}; \ uint16x8_t _mask_vec = vld1q_u16(_mask); \ uint16x8_t _a = vreinterpretq_u16_m128i(a); \ uint16x8_t _b = vreinterpretq_u16_m128i(b); \ vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \ }) // Blend packed double-precision (64-bit) floating-point elements from a and b // using control mask imm8, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_pd #define _mm_blend_pd(a, b, imm) \ __extension__({ \ const uint64_t _mask[2] = { \ ((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0), \ ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)}; \ uint64x2_t _mask_vec = vld1q_u64(_mask); \ uint64x2_t _a = vreinterpretq_u64_m128d(a); \ uint64x2_t _b = vreinterpretq_u64_m128d(b); \ vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, _b, _a)); \ }) // Blend packed single-precision (32-bit) floating-point elements from a and b // using mask, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8) { const uint32_t ALIGN_STRUCT(16) data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0, ((imm8) & (1 << 1)) ? UINT32_MAX : 0, ((imm8) & (1 << 2)) ? UINT32_MAX : 0, ((imm8) & (1 << 3)) ? UINT32_MAX : 0}; uint32x4_t mask = vld1q_u32(data); float32x4_t a = vreinterpretq_f32_m128(_a); float32x4_t b = vreinterpretq_f32_m128(_b); return vreinterpretq_m128_f32(vbslq_f32(mask, b, a)); } // Blend packed 8-bit integers from a and b using mask, and store the results in // dst. // // FOR j := 0 to 15 // i := j*8 // IF mask[i+7] // dst[i+7:i] := b[i+7:i] // ELSE // dst[i+7:i] := a[i+7:i] // FI // ENDFOR FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask) { // Use a signed shift right to create a mask with the sign bit uint8x16_t mask = vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7)); uint8x16_t a = vreinterpretq_u8_m128i(_a); uint8x16_t b = vreinterpretq_u8_m128i(_b); return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a)); } // Blend packed double-precision (64-bit) floating-point elements from a and b // using mask, and store the results in dst. 
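//
// Usage sketch (illustrative): because only the sign bit of each mask lane is
// inspected, a comparison result can be fed straight in for branch-free
// selection, e.g. a per-lane maximum of two __m128d values:
//
//   __m128d vmax = _mm_blendv_pd(a, b, _mm_cmplt_pd(a, b));   // b where a < b
//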
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask) { uint64x2_t mask = vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63)); #if defined(__aarch64__) float64x2_t a = vreinterpretq_f64_m128d(_a); float64x2_t b = vreinterpretq_f64_m128d(_b); return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a)); #else uint64x2_t a = vreinterpretq_u64_m128d(_a); uint64x2_t b = vreinterpretq_u64_m128d(_b); return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a)); #endif } // Blend packed single-precision (32-bit) floating-point elements from a and b // using mask, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask) { // Use a signed shift right to create a mask with the sign bit uint32x4_t mask = vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31)); float32x4_t a = vreinterpretq_f32_m128(_a); float32x4_t b = vreinterpretq_f32_m128(_b); return vreinterpretq_m128_f32(vbslq_f32(mask, b, a)); } // Round the packed double-precision (64-bit) floating-point elements in a up // to an integer value, and store the results as packed double-precision // floating-point elements in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_pd FORCE_INLINE __m128d _mm_ceil_pd(__m128d a) { #if defined(__aarch64__) return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a))); #else double *f = (double *) &a; return _mm_set_pd(CEIL(f[1]), CEIL(f[0])); #endif } // Round the packed single-precision (32-bit) floating-point elements in a up to // an integer value, and store the results as packed single-precision // floating-point elements in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps FORCE_INLINE __m128 _mm_ceil_ps(__m128 a) { #if defined(__aarch64__) return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a))); #else float *f = (float *) &a; return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0])); #endif } // Round the lower double-precision (64-bit) floating-point element in b up to // an integer value, store the result as a double-precision floating-point // element in the lower element of dst, and copy the upper element from a to the // upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_sd FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b) { return _mm_move_sd(a, _mm_ceil_pd(b)); } // Round the lower single-precision (32-bit) floating-point element in b up to // an integer value, store the result as a single-precision floating-point // element in the lower element of dst, and copy the upper 3 packed elements // from a to the upper elements of dst. 
// // dst[31:0] := CEIL(b[31:0]) // dst[127:32] := a[127:32] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_ceil_ps(b)); } // Compare packed 64-bit integers in a and b for equality, and store the results // in dst FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b) { #if defined(__aarch64__) return vreinterpretq_m128i_u64( vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b))); #else // ARMv7 lacks vceqq_u64 // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)); uint32x4_t swapped = vrev64q_u32(cmp); return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped)); #endif } // Converts the four signed 16-bit integers in the lower 64 bits to four signed // 32-bit integers. FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a) { return vreinterpretq_m128i_s32( vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a)))); } // Converts the two signed 16-bit integers in the lower 32 bits two signed // 32-bit integers. FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a) { int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */ int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ return vreinterpretq_m128i_s64(s64x2); } // Converts the two signed 32-bit integers in the lower 64 bits to two signed // 64-bit integers. FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a) { return vreinterpretq_m128i_s64( vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))); } // Converts the four unsigned 8-bit integers in the lower 16 bits to four // unsigned 32-bit integers. FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a) { int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */ int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ return vreinterpretq_m128i_s16(s16x8); } // Converts the four unsigned 8-bit integers in the lower 32 bits to four // unsigned 32-bit integers. FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a) { int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */ int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */ return vreinterpretq_m128i_s32(s32x4); } // Converts the two signed 8-bit integers in the lower 32 bits to four // signed 64-bit integers. FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a) { int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */ int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */ int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ return vreinterpretq_m128i_s64(s64x2); } // Converts the four unsigned 16-bit integers in the lower 64 bits to four // unsigned 32-bit integers. FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a) { return vreinterpretq_m128i_u32( vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a)))); } // Converts the two unsigned 16-bit integers in the lower 32 bits to two // unsigned 64-bit integers. 
FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a) { uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */ uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ return vreinterpretq_m128i_u64(u64x2); } // Converts the two unsigned 32-bit integers in the lower 64 bits to two // unsigned 64-bit integers. FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a) { return vreinterpretq_m128i_u64( vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a)))); } // Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, // and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi16 FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a) { uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx HGFE DCBA */ uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */ return vreinterpretq_m128i_u16(u16x8); } // Converts the four unsigned 8-bit integers in the lower 32 bits to four // unsigned 32-bit integers. // https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a) { uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */ uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */ return vreinterpretq_m128i_u32(u32x4); } // Converts the two unsigned 8-bit integers in the lower 16 bits to two // unsigned 64-bit integers. FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a) { uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */ uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */ uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ return vreinterpretq_m128i_u64(u64x2); } // Conditionally multiply the packed double-precision (64-bit) floating-point // elements in a and b using the high 4 bits in imm8, sum the four products, and // conditionally store the sum in dst using the low 4 bits of imm8. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_pd FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm) { // Generate mask value from constant immediate bit value const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0; const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0; #if !SSE2NEON_PRECISE_DP const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0; const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0; #endif // Conditional multiplication #if !SSE2NEON_PRECISE_DP __m128d mul = _mm_mul_pd(a, b); const __m128d mulMask = _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask)); __m128d tmp = _mm_and_pd(mul, mulMask); #else #if defined(__aarch64__) double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) * vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0) : 0; double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) * vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1) : 0; #else double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0; double d1 = (imm & 0x20) ? 
((double *) &a)[1] * ((double *) &b)[1] : 0; #endif __m128d tmp = _mm_set_pd(d1, d0); #endif // Sum the products #if defined(__aarch64__) double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp)); #else double sum = *((double *) &tmp) + *(((double *) &tmp) + 1); #endif // Conditionally store the sum const __m128d sumMask = _mm_castsi128_pd(_mm_set_epi64x(bit1Mask, bit0Mask)); __m128d res = _mm_and_pd(_mm_set_pd1(sum), sumMask); return res; } // Conditionally multiply the packed single-precision (32-bit) floating-point // elements in a and b using the high 4 bits in imm8, sum the four products, // and conditionally store the sum in dst using the low 4 bits of imm. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm) { #if defined(__aarch64__) /* shortcuts */ if (imm == 0xFF) { return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b))); } if (imm == 0x7F) { float32x4_t m = _mm_mul_ps(a, b); m[3] = 0; return _mm_set1_ps(vaddvq_f32(m)); } #endif float s = 0, c = 0; float32x4_t f32a = vreinterpretq_f32_m128(a); float32x4_t f32b = vreinterpretq_f32_m128(b); /* To improve the accuracy of floating-point summation, Kahan algorithm * is used for each operation. */ if (imm & (1 << 4)) _sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]); if (imm & (1 << 5)) _sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]); if (imm & (1 << 6)) _sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]); if (imm & (1 << 7)) _sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]); s += c; float32x4_t res = { (imm & 0x1) ? s : 0, (imm & 0x2) ? s : 0, (imm & 0x4) ? s : 0, (imm & 0x8) ? s : 0, }; return vreinterpretq_m128_f32(res); } // Extracts the selected signed or unsigned 32-bit integer from a and zero // extends. // FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm) #define _mm_extract_epi32(a, imm) \ vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)) // Extracts the selected signed or unsigned 64-bit integer from a and zero // extends. // FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm) #define _mm_extract_epi64(a, imm) \ vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm)) // Extracts the selected signed or unsigned 8-bit integer from a and zero // extends. // FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm) // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi8 #define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm)) // Extracts the selected single-precision (32-bit) floating-point from a. // FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm) #define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm)) // Round the packed double-precision (64-bit) floating-point elements in a down // to an integer value, and store the results as packed double-precision // floating-point elements in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_pd FORCE_INLINE __m128d _mm_floor_pd(__m128d a) { #if defined(__aarch64__) return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a))); #else double *f = (double *) &a; return _mm_set_pd(FLOOR(f[1]), FLOOR(f[0])); #endif } // Round the packed single-precision (32-bit) floating-point elements in a down // to an integer value, and store the results as packed single-precision // floating-point elements in dst. 
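// (Editorial sketch; not part of the upstream sse2neon header, helper name
// is ours.)  The imm8 convention of _mm_dp_ps, as implemented above: the
// high nibble selects which per-lane products contribute to the sum, the
// low nibble selects which result lanes receive that sum.  0x71 therefore
// yields a 3-component dot product placed in lane 0 only:
FORCE_INLINE float _sse2neon_example_dot3(__m128 a, __m128 b)
{
    return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x71));
}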
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps FORCE_INLINE __m128 _mm_floor_ps(__m128 a) { #if defined(__aarch64__) return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a))); #else float *f = (float *) &a; return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0])); #endif } // Round the lower double-precision (64-bit) floating-point element in b down to // an integer value, store the result as a double-precision floating-point // element in the lower element of dst, and copy the upper element from a to the // upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_sd FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b) { return _mm_move_sd(a, _mm_floor_pd(b)); } // Round the lower single-precision (32-bit) floating-point element in b down to // an integer value, store the result as a single-precision floating-point // element in the lower element of dst, and copy the upper 3 packed elements // from a to the upper elements of dst. // // dst[31:0] := FLOOR(b[31:0]) // dst[127:32] := a[127:32] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_floor_ps(b)); } // Inserts the least significant 32 bits of b into the selected 32-bit integer // of a. // FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b, // __constrange(0,4) int imm) #define _mm_insert_epi32(a, b, imm) \ __extension__({ \ vreinterpretq_m128i_s32( \ vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \ }) // Inserts the least significant 64 bits of b into the selected 64-bit integer // of a. // FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b, // __constrange(0,2) int imm) #define _mm_insert_epi64(a, b, imm) \ __extension__({ \ vreinterpretq_m128i_s64( \ vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \ }) // Inserts the least significant 8 bits of b into the selected 8-bit integer // of a. // FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b, // __constrange(0,16) int imm) #define _mm_insert_epi8(a, b, imm) \ __extension__({ \ vreinterpretq_m128i_s8( \ vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \ }) // Copy a to tmp, then insert a single-precision (32-bit) floating-point // element from b into tmp using the control in imm8. Store tmp to dst using // the mask in imm8 (elements are zeroed out when the corresponding bit is set). // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=insert_ps #define _mm_insert_ps(a, b, imm8) \ __extension__({ \ float32x4_t tmp1 = \ vsetq_lane_f32(vgetq_lane_f32(b, (imm8 >> 6) & 0x3), \ vreinterpretq_f32_m128(a), 0); \ float32x4_t tmp2 = \ vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), vreinterpretq_f32_m128(a), \ ((imm8 >> 4) & 0x3)); \ const uint32_t data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0, \ ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \ ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \ ((imm8) & (1 << 3)) ? UINT32_MAX : 0}; \ uint32x4_t mask = vld1q_u32(data); \ float32x4_t all_zeros = vdupq_n_f32(0); \ \ vreinterpretq_m128_f32( \ vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))); \ }) // epi versions of min/max // Computes the pariwise maximums of the four signed 32-bit integer values of a // and b. // // A 128-bit parameter that can be defined with the following equations: // r0 := (a0 > b0) ? a0 : b0 // r1 := (a1 > b1) ? a1 : b1 // r2 := (a2 > b2) ? a2 : b2 // r3 := (a3 > b3) ? 
a3 : b3 // // https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Compare packed signed 8-bit integers in a and b, and store packed maximum // values in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8 FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_s8( vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } // Compare packed unsigned 16-bit integers in a and b, and store packed maximum // values in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16 FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b) { return vreinterpretq_m128i_u16( vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); } // Compare packed unsigned 32-bit integers in a and b, and store packed maximum // values in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32 FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b) { return vreinterpretq_m128i_u32( vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b))); } // Computes the pariwise minima of the four signed 32-bit integer values of a // and b. // // A 128-bit parameter that can be defined with the following equations: // r0 := (a0 < b0) ? a0 : b0 // r1 := (a1 < b1) ? a1 : b1 // r2 := (a2 < b2) ? a2 : b2 // r3 := (a3 < b3) ? a3 : b3 // // https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Compare packed signed 8-bit integers in a and b, and store packed minimum // values in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8 FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_s8( vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } // Compare packed unsigned 16-bit integers in a and b, and store packed minimum // values in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16 FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b) { return vreinterpretq_m128i_u16( vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); } // Compare packed unsigned 32-bit integers in a and b, and store packed minimum // values in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32 FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b) { return vreinterpretq_m128i_u32( vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b))); } // Horizontally compute the minimum amongst the packed unsigned 16-bit integers // in a, store the minimum and index in dst, and zero the remaining bits in dst. 
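// (Editorial sketch; not part of the upstream sse2neon header, helper name
// is ours.)  The packed min/max operations defined above combine into the
// usual branch-free clamp:
FORCE_INLINE __m128i _sse2neon_example_clamp_epi32(__m128i x,
                                                   __m128i lo,
                                                   __m128i hi)
{
    return _mm_min_epi32(_mm_max_epi32(x, lo), hi);  // lo <= result <= hi
}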
// // index[2:0] := 0 // min[15:0] := a[15:0] // FOR j := 0 to 7 // i := j*16 // IF a[i+15:i] < min[15:0] // index[2:0] := j // min[15:0] := a[i+15:i] // FI // ENDFOR // dst[15:0] := min[15:0] // dst[18:16] := index[2:0] // dst[127:19] := 0 // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16 FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a) { __m128i dst; uint16_t min, idx = 0; // Find the minimum value #if defined(__aarch64__) min = vminvq_u16(vreinterpretq_u16_m128i(a)); #else __m64 tmp; tmp = vreinterpret_m64_u16( vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)), vget_high_u16(vreinterpretq_u16_m128i(a)))); tmp = vreinterpret_m64_u16( vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp))); tmp = vreinterpret_m64_u16( vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp))); min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0); #endif // Get the index of the minimum value int i; for (i = 0; i < 8; i++) { if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) { idx = (uint16_t) i; break; } a = _mm_srli_si128(a, 2); } // Generate result dst = _mm_setzero_si128(); dst = vreinterpretq_m128i_u16( vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0)); dst = vreinterpretq_m128i_u16( vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1)); return dst; } // Compute the sum of absolute differences (SADs) of quadruplets of unsigned // 8-bit integers in a compared to those in b, and store the 16-bit results in // dst. Eight SADs are performed using one quadruplet from b and eight // quadruplets from a. One quadruplet is selected from b starting at on the // offset specified in imm8. Eight quadruplets are formed from sequential 8-bit // integers selected from a starting at the offset specified in imm8. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mpsadbw_epu8 FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm) { uint8x16_t _a, _b; switch (imm & 0x4) { case 0: // do nothing _a = vreinterpretq_u8_m128i(a); break; case 4: _a = vreinterpretq_u8_u32(vextq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(a), 1)); break; default: #if defined(__GNUC__) || defined(__clang__) __builtin_unreachable(); #endif break; } switch (imm & 0x3) { case 0: _b = vreinterpretq_u8_u32( vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0))); break; case 1: _b = vreinterpretq_u8_u32( vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1))); break; case 2: _b = vreinterpretq_u8_u32( vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2))); break; case 3: _b = vreinterpretq_u8_u32( vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3))); break; default: #if defined(__GNUC__) || defined(__clang__) __builtin_unreachable(); #endif break; } int16x8_t c04, c15, c26, c37; uint8x8_t low_b = vget_low_u8(_b); c04 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b))); _a = vextq_u8(_a, _a, 1); c15 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b))); _a = vextq_u8(_a, _a, 1); c26 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b))); _a = vextq_u8(_a, _a, 1); c37 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b))); #if defined(__aarch64__) // |0|4|2|6| c04 = vpaddq_s16(c04, c26); // |1|5|3|7| c15 = vpaddq_s16(c15, c37); int32x4_t trn1_c = vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15)); int32x4_t trn2_c = vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15)); return 
vreinterpretq_m128i_s16(vpaddq_s16(vreinterpretq_s16_s32(trn1_c), vreinterpretq_s16_s32(trn2_c))); #else int16x4_t c01, c23, c45, c67; c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15)); c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37)); c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15)); c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37)); return vreinterpretq_m128i_s16( vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67))); #endif } // Multiply the low signed 32-bit integers from each packed 64-bit element in // a and b, and store the signed 64-bit results in dst. // // r0 := (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0 // r1 := (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2 FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b) { // vmull_s32 upcasts instead of masking, so we downcast. int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a)); int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b)); return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo)); } // Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or // unsigned 32-bit integers from b. // https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit // integers and saturates. // // r0 := UnsignedSaturate(a0) // r1 := UnsignedSaturate(a1) // r2 := UnsignedSaturate(a2) // r3 := UnsignedSaturate(a3) // r4 := UnsignedSaturate(b0) // r5 := UnsignedSaturate(b1) // r6 := UnsignedSaturate(b2) // r7 := UnsignedSaturate(b3) FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_u16( vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)), vqmovun_s32(vreinterpretq_s32_m128i(b)))); } // Round the packed double-precision (64-bit) floating-point elements in a using // the rounding parameter, and store the results as packed double-precision // floating-point elements in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_pd FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding) { #if defined(__aarch64__) switch (rounding) { case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a))); case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): return _mm_floor_pd(a); case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): return _mm_ceil_pd(a); case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a))); default: //_MM_FROUND_CUR_DIRECTION return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a))); } #else double *v_double = (double *) &a; if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) || (rounding == _MM_FROUND_CUR_DIRECTION && _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) { double res[2], tmp; for (int i = 0; i < 2; i++) { tmp = (v_double[i] < 0) ? 
-v_double[i] : v_double[i]; double roundDown = FLOOR(tmp); // Round down value double roundUp = CEIL(tmp); // Round up value double diffDown = tmp - roundDown; double diffUp = roundUp - tmp; if (diffDown < diffUp) { /* If it's closer to the round down value, then use it */ res[i] = roundDown; } else if (diffDown > diffUp) { /* If it's closer to the round up value, then use it */ res[i] = roundUp; } else { /* If it's equidistant between round up and round down value, * pick the one which is an even number */ double half = roundDown / 2; if (half != FLOOR(half)) { /* If the round down value is odd, return the round up value */ res[i] = roundUp; } else { /* If the round up value is odd, return the round down value */ res[i] = roundDown; } } res[i] = (v_double[i] < 0) ? -res[i] : res[i]; } return _mm_set_pd(res[1], res[0]); } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) || (rounding == _MM_FROUND_CUR_DIRECTION && _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) { return _mm_floor_pd(a); } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) || (rounding == _MM_FROUND_CUR_DIRECTION && _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) { return _mm_ceil_pd(a); } return _mm_set_pd(v_double[1] > 0 ? FLOOR(v_double[1]) : CEIL(v_double[1]), v_double[0] > 0 ? FLOOR(v_double[0]) : CEIL(v_double[0])); #endif } // Round the packed single-precision (32-bit) floating-point elements in a using // the rounding parameter, and store the results as packed single-precision // floating-point elements in dst. // software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding) { #if defined(__aarch64__) switch (rounding) { case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a))); case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): return _mm_floor_ps(a); case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): return _mm_ceil_ps(a); case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a))); default: //_MM_FROUND_CUR_DIRECTION return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a))); } #else float *v_float = (float *) &a; if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) || (rounding == _MM_FROUND_CUR_DIRECTION && _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) { uint32x4_t signmask = vdupq_n_u32(0x80000000); float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a), vdupq_n_f32(0.5f)); /* +/- 0.5 */ int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32( vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/ int32x4_t r_trunc = vcvtq_s32_f32( vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */ int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32( vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ float32x4_t delta = vsubq_f32( vreinterpretq_f32_m128(a), vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */ return vreinterpretq_m128_f32( vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal))); } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) || (rounding == _MM_FROUND_CUR_DIRECTION && _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) { return _mm_floor_ps(a); } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) || (rounding == _MM_FROUND_CUR_DIRECTION && 
_MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) { return _mm_ceil_ps(a); } return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]), v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]), v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]), v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0])); #endif } // Round the lower double-precision (64-bit) floating-point element in b using // the rounding parameter, store the result as a double-precision floating-point // element in the lower element of dst, and copy the upper element from a to the // upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_sd FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding) { return _mm_move_sd(a, _mm_round_pd(b, rounding)); } // Round the lower single-precision (32-bit) floating-point element in b using // the rounding parameter, store the result as a single-precision floating-point // element in the lower element of dst, and copy the upper 3 packed elements // from a to the upper elements of dst. Rounding is done according to the // rounding[3:0] parameter, which can be one of: // (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and // suppress exceptions // (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and // suppress exceptions // (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress // exceptions // (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress // exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see // _MM_SET_ROUNDING_MODE // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ss FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding) { return _mm_move_ss(a, _mm_round_ps(b, rounding)); } // Load 128-bits of integer data from memory into dst using a non-temporal // memory hint. mem_addr must be aligned on a 16-byte boundary or a // general-protection exception may be generated. // // dst[127:0] := MEM[mem_addr+127:mem_addr] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128 FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p) { #if __has_builtin(__builtin_nontemporal_store) return __builtin_nontemporal_load(p); #else return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p)); #endif } // Compute the bitwise NOT of a and then AND with a 128-bit vector containing // all 1's, and return 1 if the result is zero, otherwise return 0. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones FORCE_INLINE int _mm_test_all_ones(__m128i a) { return (uint64_t)(vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) == ~(uint64_t) 0; } // Compute the bitwise AND of 128 bits (representing integer data) in a and // mask, and return 1 if the result is zero, otherwise return 0. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask) { int64x2_t a_and_mask = vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask)); return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)); } // Compute the bitwise AND of 128 bits (representing integer data) in a and // mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute // the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is // zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, // otherwise return 0. 
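// (Editorial sketch; not part of the upstream sse2neon header, helper name
// is ours.)  As on x86, the rounding emulation above resolves halfway cases
// to even under _MM_FROUND_TO_NEAREST_INT, so 2.5 rounds to 2.0 and 3.5 to
// 4.0; both the AArch64 path (vrndnq_f64) and the ARMv7 fallback above
// implement this tie-to-even rule.
FORCE_INLINE double _sse2neon_example_round_half_even(double x)
{
    __m128d v = _mm_set_pd(0.0, x);  // x goes into the lower lane
    __m128d r = _mm_round_pd(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    return _mm_cvtsd_f64(r);         // lower lane of the rounded vector
}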
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_test_mix_ones_zero FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask) { uint64x2_t zf = vandq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a)); uint64x2_t cf = vbicq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a)); uint64x2_t result = vandq_u64(zf, cf); return !(vgetq_lane_u64(result, 0) | vgetq_lane_u64(result, 1)); } // Compute the bitwise AND of 128 bits (representing integer data) in a and b, // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, // otherwise set CF to 0. Return the CF value. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128 FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b) { int64x2_t s64 = vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_m128i(a))), vreinterpretq_s64_m128i(b)); return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); } // Compute the bitwise AND of 128 bits (representing integer data) in a and b, // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, // otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, // otherwise return 0. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_si128 #define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b) // Compute the bitwise AND of 128 bits (representing integer data) in a and b, // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, // otherwise set CF to 0. Return the ZF value. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128 FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b) { int64x2_t s64 = vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)); return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); } /* SSE4.2 */ // Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers // in b for greater than. FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b) { #if defined(__aarch64__) return vreinterpretq_m128i_u64( vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); #else return vreinterpretq_m128i_s64(vshrq_n_s64( vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)), 63)); #endif } // Starting with the initial value in crc, accumulates a CRC32 value for // unsigned 16-bit integer v. // https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100) FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v) { #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t" : [c] "+r"(crc) : [v] "r"(v)); #else crc = _mm_crc32_u8(crc, v & 0xff); crc = _mm_crc32_u8(crc, (v >> 8) & 0xff); #endif return crc; } // Starting with the initial value in crc, accumulates a CRC32 value for // unsigned 32-bit integer v. 
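// (Editorial sketch; not part of the upstream sse2neon header, helper name
// is ours.)  In ZF/CF terms: _mm_testz_si128(a, b) returns 1 when
// (a & b) == 0, and _mm_testc_si128(a, b) returns 1 when (~a & b) == 0,
// i.e. every bit set in b is also set in a.  The latter makes a compact
// 128-bit "contains all flags" test:
FORCE_INLINE int _sse2neon_example_contains_all(__m128i have, __m128i want)
{
    return _mm_testc_si128(have, want);  // 1 iff (want & ~have) == 0
}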
// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100) FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v) { #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t" : [c] "+r"(crc) : [v] "r"(v)); #else crc = _mm_crc32_u16(crc, v & 0xffff); crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff); #endif return crc; } // Starting with the initial value in crc, accumulates a CRC32 value for // unsigned 64-bit integer v. // https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100) FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v) { #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t" : [c] "+r"(crc) : [v] "r"(v)); #else crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff); crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff); #endif return crc; } // Starting with the initial value in crc, accumulates a CRC32 value for // unsigned 8-bit integer v. // https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100) FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v) { #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t" : [c] "+r"(crc) : [v] "r"(v)); #else crc ^= v; for (int bit = 0; bit < 8; bit++) { if (crc & 1) crc = (crc >> 1) ^ UINT32_C(0x82f63b78); else crc = (crc >> 1); } #endif return crc; } /* AES */ #if !defined(__ARM_FEATURE_CRYPTO) /* clang-format off */ #define SSE2NEON_AES_DATA(w) \ { \ w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \ w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \ w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \ w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \ w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \ w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \ w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \ w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \ w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \ w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \ w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \ w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \ w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \ w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \ w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \ w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \ w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \ w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \ w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \ w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \ w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \ w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \ w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \ w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \ w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \ w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \ w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \ w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \ w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \ w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \ w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \ w(0x35), w(0x57), 
w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \ w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \ w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \ w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \ w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \ w(0xb0), w(0x54), w(0xbb), w(0x16) \ } /* clang-format on */ /* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */ #define SSE2NEON_AES_H0(x) (x) static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0); #undef SSE2NEON_AES_H0 // In the absence of crypto extensions, implement aesenc using regular neon // intrinsics instead. See: // https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/ // https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and // https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52 // for more information Reproduced with permission of the author. FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey) { #if defined(__aarch64__) static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3, 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb}; static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4, 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc}; uint8x16_t v; uint8x16_t w = vreinterpretq_u8_m128i(EncBlock); // shift rows w = vqtbl1q_u8(w, vld1q_u8(shift_rows)); // sub bytes v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(SSE2NEON_sbox), w); v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40); v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80); v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0); // mix columns w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b); w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v); w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8)); // add round key return vreinterpretq_m128i_u8(w) ^ RoundKey; #else /* ARMv7-A NEON implementation */ #define SSE2NEON_AES_B2W(b0, b1, b2, b3) \ (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \ (b0)) #define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */)) #define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x) #define SSE2NEON_AES_U0(p) \ SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p)) #define SSE2NEON_AES_U1(p) \ SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p) #define SSE2NEON_AES_U2(p) \ SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p) #define SSE2NEON_AES_U3(p) \ SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p)) static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = { SSE2NEON_AES_DATA(SSE2NEON_AES_U0), SSE2NEON_AES_DATA(SSE2NEON_AES_U1), SSE2NEON_AES_DATA(SSE2NEON_AES_U2), SSE2NEON_AES_DATA(SSE2NEON_AES_U3), }; #undef SSE2NEON_AES_B2W #undef SSE2NEON_AES_F2 #undef SSE2NEON_AES_F3 #undef SSE2NEON_AES_U0 #undef SSE2NEON_AES_U1 #undef SSE2NEON_AES_U2 #undef SSE2NEON_AES_U3 uint32_t x0 = _mm_cvtsi128_si32(EncBlock); uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55)); uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA)); uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF)); __m128i out = _mm_set_epi32( (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^ aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]), (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^ aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]), (aes_table[0][x1 & 
0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^ aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]), (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^ aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24])); return _mm_xor_si128(out, RoundKey); #endif } // Perform the last round of an AES encryption flow on data (state) in a using // the round key in RoundKey, and store the result in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128 FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) { /* FIXME: optimized for NEON */ uint8_t v[4][4] = { {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)], SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)], SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)], SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]}, {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)], SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)], SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)], SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]}, {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)], SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)], SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)], SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]}, {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)], SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)], SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)], SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]}, }; for (int i = 0; i < 16; i++) vreinterpretq_nth_u8_m128i(a, i) = v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i); return a; } // Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist. // This instruction generates a round key for AES encryption. See // https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/ // for details. // // https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon) { uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55)); uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF)); for (int i = 0; i < 4; ++i) { ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]]; ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]]; } return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3, ((X1 >> 8) | (X1 << 24)) ^ rcon, X1); } #undef SSE2NEON_AES_DATA #else /* __ARM_FEATURE_CRYPTO */ // Implements equivalent of 'aesenc' by combining AESE (with an empty key) and // AESMC and then manually applying the real key as an xor operation. This // unfortunately means an additional xor op; the compiler should be able to // optimize this away for repeated calls however. See // https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a // for more details. 
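// (Editorial note; not part of the upstream sse2neon header.)  The identity
// used below, written out: ARM's AESE(state, key) computes
// ShiftRows(SubBytes(state ^ key)), while x86's AESENC(state, rk) computes
// MixColumns(ShiftRows(SubBytes(state))) ^ rk.  Passing an all-zero key to
// AESE therefore yields exactly the SubBytes/ShiftRows part, AESMC supplies
// MixColumns, and the real round key is applied afterwards by hand:
//
//     AESENC(s, rk) == AESMC(AESE(s, 0)) ^ rk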
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^ vreinterpretq_u8_m128i(b)); } // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128 FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) { return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8( vreinterpretq_u8_m128i(a), vdupq_n_u8(0))), RoundKey); } FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) { // AESE does ShiftRows and SubBytes on A uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)); uint8x16_t dest = { // Undo ShiftRows step from AESE and extract X1 and X3 u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1) u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1)) u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3) u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3)) }; uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon}; return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r); } #endif /* Others */ // Perform a carry-less multiplication of two 64-bit integers, selected from a // and b according to imm8, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128 FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm) { uint64x2_t a = vreinterpretq_u64_m128i(_a); uint64x2_t b = vreinterpretq_u64_m128i(_b); switch (imm & 0x11) { case 0x00: return vreinterpretq_m128i_u64( _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b))); case 0x01: return vreinterpretq_m128i_u64( _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b))); case 0x10: return vreinterpretq_m128i_u64( _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b))); case 0x11: return vreinterpretq_m128i_u64( _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b))); default: abort(); } } FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode() { union { fpcr_bitfield field; #if defined(__aarch64__) uint64_t value; #else uint32_t value; #endif } r; #if defined(__aarch64__) __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */ #else __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ #endif return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF; } // Count the number of bits set to 1 in unsigned 32-bit integer a, and // return that count in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32 FORCE_INLINE int _mm_popcnt_u32(unsigned int a) { #if defined(__aarch64__) #if __has_builtin(__builtin_popcount) return __builtin_popcount(a); #else return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a))); #endif #else uint32_t count = 0; uint8x8_t input_val, count8x8_val; uint16x4_t count16x4_val; uint32x2_t count32x2_val; input_val = vld1_u8((uint8_t *) &a); count8x8_val = vcnt_u8(input_val); count16x4_val = vpaddl_u8(count8x8_val); count32x2_val = vpaddl_u16(count16x4_val); vst1_u32(&count, count32x2_val); return count; #endif } // Count the number of bits set to 1 in unsigned 64-bit integer a, and // return that count in dst. 
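// (Editorial sketch; not part of the upstream sse2neon header, helper name
// is ours.)  A portable scalar reference that can be used to cross-check
// the NEON/builtin paths of _mm_popcnt_u32 above (Kernighan's trick: each
// iteration clears the lowest set bit):
FORCE_INLINE int _sse2neon_example_popcount_ref(uint32_t x)
{
    int n = 0;
    while (x) {
        x &= x - 1;
        n++;
    }
    return n;
}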
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64 FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) { #if defined(__aarch64__) #if __has_builtin(__builtin_popcountll) return __builtin_popcountll(a); #else return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a))); #endif #else uint64_t count = 0; uint8x8_t input_val, count8x8_val; uint16x4_t count16x4_val; uint32x2_t count32x2_val; uint64x1_t count64x1_val; input_val = vld1_u8((uint8_t *) &a); count8x8_val = vcnt_u8(input_val); count16x4_val = vpaddl_u8(count8x8_val); count32x2_val = vpaddl_u16(count16x4_val); count64x1_val = vpaddl_u32(count32x2_val); vst1_u64(&count, count64x1_val); return count; #endif } FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag) { // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting, // regardless of the value of the FZ bit. union { fpcr_bitfield field; #if defined(__aarch64__) uint64_t value; #else uint32_t value; #endif } r; #if defined(__aarch64__) __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */ #else __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ #endif r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON; #if defined(__aarch64__) __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */ #else __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ #endif } #if defined(__GNUC__) || defined(__clang__) #pragma pop_macro("ALIGN_STRUCT") #pragma pop_macro("FORCE_INLINE") #endif #if defined(__GNUC__) && !defined(__clang__) #pragma GCC pop_options #endif #endif RandomFieldsUtils/src/solve_gpu.h0000644000176200001440000000066314227157055016622 0ustar liggesusers #ifndef RFutils_gpusolve #define RFutils_gpusolve 1 int cholGPU(bool copy, double *matrix, Uint size, double *B, Uint rhs_cols, double *LogDet, double *RESULT); void mgpuSolve(double *matrix, Uint individuals, double *vector); void gpu_relmat_custom(Uint*, double*, Uint, Uint); void gpu_relmat_cublas(Uint*, double*, Uint, Uint); // #define PADDIM 4L //#define BLOCKS 1024 #define THREADS_PER_BLOCK 1024 //2048 / 32 #endif RandomFieldsUtils/src/win_linux_aux.h0000644000176200001440000000222314227157055017502 0ustar liggesusers /* Authors Martin Schlather, schlather@math.uni-mannheim.de Copyright (C) 2015 -- 2021 Martin Schlather This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ #ifndef WIN_LINUX_AUX_H #define WIN_LINUX_AUX_H 1 uint32_t cpuid_info(int Blatt, int Register);//MINGWCPUID, WINCPUID, LINUXCPUID #ifdef __cplusplus extern "C" { #endif void sleepMilli(int *milli); void sleepMicro(int *milli); void pid(int *i); void hostname(char **h, int *i); bool parallel(); #ifdef __cplusplus } #endif #endif /* WIN_LINUX_AUX_H */ RandomFieldsUtils/R/0000755000176200001440000000000014227157055014053 5ustar liggesusersRandomFieldsUtils/R/utils.R0000644000176200001440000002036714227157055015346 0ustar liggesusers## Authors ## Martin Schlather, schlather@math.uni-mannheim.de ## ## ## Copyright (C) 2015 -- 2021 Martin Schlather ## ## This program is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License ## as published by the Free Software Foundation; either version 3 ## of the License, or (at your option) any later version. ## ## This program is distributed in the hope that it will be useful, ## but WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ## GNU General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with this program; if not, write to the Free Software ## Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. sleep.milli <- function(n) { .C(C_sleepMilli, as.integer(n)) invisible(NULL) } sleep.micro <- function(n) { .C(C_sleepMicro, as.integer(n)) invisible(NULL) } hostname<-function(){.C(C_hostname, h=paste(seq(0,0,l=100), collapse=""), as.integer(100))$h} pid <- function() {.C(C_pid, i=integer(1))$i} LockFile <- function(file, printlevel=RFoptions()$basic$printlevel) { PL_ERRORS <- 6 lock.ext <- ".lock" LockFile <- paste(file, lock.ext, sep="") if (file.exists(LockFile)) { #2. if (printlevel>=PL_ERRORS ) cat("'",file,"' is locked.\n"); return(2); } PID <- pid(); write(file=LockFile,c(PID,hostname()),ncolumns=2,append=TRUE); #3.a. Pid <- matrix(scan(LockFile,what=character(0), quiet=TRUE),nrow=2) if ((sum(Pid[1,]==PID)!=1) || (sum(Pid[1,]>PID)>0)){ #3.b. if (printlevel>PL_ERRORS ) cat("Lock file of '", file, "' is knocked out.\n"); return(3); } return(0); } FileExists <- function(file, printlevel=RFoptions()$basic$printlevel) { ## for parallel simulation studies: the same data output file should not ## be created twice. So: ## 1. if file exists then assume another process has done the work already ## 2. if file.lock existss then assume another process is doing the work ## 3.a. otherwise create file.lock to show other processes that the process ## will do the work ## 3.b. check if another process has started with the same work at the same ## time it may happen that in case of simulatenous creation of file.lock ## no process will do the work...(then the lock file will rest.) PL_ERRORS <- 6 if (file.exists(file)) { #1. 
if (printlevel>=PL_ERRORS ) cat("'", file, "' already exists.\n"); return(1) } else { return(LockFile(file, printlevel=printlevel)) } } get.lscpu <- function(pattern) { x <- system(paste0("lscpu | egrep '", pattern, "'"), intern=TRUE) w <- base::options()$warn base::options(warn=-1) x <- Try(as.integer(sapply(strsplit(x, ":"), function(x) x[2]))) if (is(x, CLASS_TRYERROR)) return(NA) x <- x[is.finite(x)] base::options(warn = w) return(if (length(x) > 0) x[1] else NA) } WaitOthers <- function(file, i, cores=NULL, ideal.processes=ceiling(cores * 1.25), max.processes=ceiling(cores * 1.5), distance=5, time=5, path="./") { ## time in minutes if (length(cores)==0) cores <- cores() maxint <- .Machine$integer.max file0 <- paste(file, "wait", sep=".") wait.pattern <- paste0(path, "*.wait") repeat { files <- dir(pattern=wait.pattern) processes <- length(files) if (processes <= cores) break Is <- integer(processes) write(file=file0, i) for (f in 1:processes) { j <- Try(as.integer(read.table(files[f]))) Is[f] <- if (is(j, CLASS_TRYERROR) || length(j) != 1) maxint else j } Is <- Is[is.finite(Is)] if (sum(Is < maxint) <= max.processes && sum(Is <= i) <= ideal.processes && sum(Is < i - distance) <= cores) break write(file=file0, maxint) sleep.milli(time * 60000) } write(file=file0, i) } LockRemove <- function(file) { ## removes auxiliary files created by FileExists & WaitOthers for (lock.ext in c("lock", "wait")) { file0 <- paste(file, lock.ext, sep=".") if (file.exists(file0)) file.remove(file0) } } Print <- function(..., digits=6, empty.lines=2) { # OK ## ?"..1" # print(..1) # print(substitute(..1)) # print(missing(..100)) max.elements <- 99 l <- list(...) n <- as.character(match.call())[-1] cat(paste(rep("\n", empty.lines), collapse="")) for (i in 1:length(l)) { cat(n[i]) if (!is.list(l[[i]]) && is.vector(l[[i]])) { L <- length(l[[i]]) if (L==0) cat(" = ") else { cat(" [", L, "] = ", sep="") cat(if (is.numeric(l[[i]])) round(l[[i]][1:min(L , max.elements)], digits=digits)# else l[[i]][1:min(L , max.elements)]) # if (max.elements < L) cat(" ...") } } else { if (is.list(l[[i]])) { cat(" = ") str(l[[i]], digits.d=digits) # OK } else { cat(" =") if (length(l[[i]]) <= 100 && FALSE) { print(if (is.numeric(l[[i]])) round(l[[i]], digits=digits)# OK else l[[i]]) } else { if (length(l[[i]]) > 1 && !is.vector(l[[i]]) && !is.matrix(l[[i]]) && !is.array(l[[i]])) cat("\n") str(l[[i]]) # OK } } } cat("\n") } } cholx <- function(a) .Call(C_Chol, a) cholPosDef <- function() stop("please use 'cholx' instead of 'cholPosDef'.") solvePosDef <- function(a, b=NULL, logdeterminant=FALSE) { stop("please use 'solvex' instead of 'solvePosDef'.") } solvex <- function(a, b=NULL, logdeterminant=FALSE) { if (logdeterminant) { logdet <- double(1) res <- .Call(C_SolvePosDefR, a, b, logdet) return(list(inv=res, logdet=logdet)) } else { .Call(C_SolvePosDefR, a, b, double(0)) } } sortx <- function(x, from=1, to=length(x), decreasing=FALSE, na.last = NA) { n <- length(x) if (n <= 4000 || (to - from) < (0.35 + is.double(x) * 0.15) * n) { if (decreasing) { x <- -x if (!is.na(na.last)) na.last <- !na.last } ans <- .Call(C_sortX, x, as.integer(from), as.integer(to), as.logical(na.last)) return(if (decreasing) -ans else ans) } else { return(if (from==1 && to==n) sort(x, decreasing=decreasing, na.last=na.last) else sort(x, decreasing=decreasing, na.last=na.last)[from:to]) } } orderx <- function(x, from=1, to=length(x), decreasing=FALSE, na.last = NA) { # cat((to - from) * (0.35 + 0.14 * log(length(x)))^2, "", length(x), "\n") if ((to - from) * 
(0.35 + 0.14 * log(length(x)))^2 > length(x)) { #10^2:1, 10^3:1.5, 10^4:3 10^5:5 10^6:5, 10^7: 8, 10^8:10, # cat("old", from, to ,"\n"); ans <- order(x, decreasing=decreasing, na.last=na.last) return(if (from==1 && to==length(x)) ans else ans[from:to]) } if (decreasing) { x <- -x if (!is.na(na.last)) na.last <- !na.last } .Call(C_orderX, x, as.integer(from), as.integer(to), as.logical(na.last)) } scalarx <- function(x, y, mode=0) .Call(C_scalarR, x, y, as.integer(mode)) crossprodx <- function(x, y, mode=-1) .Call(C_crossprodX, x, if (missing(y)) x else y, as.integer(mode)) confirm <- function(x, y, ...) { e <- all.equal(x, y, ...) if (is.logical(e) && e) { cat("'", deparse(substitute(x)) , "' and '", deparse(substitute(y)), "' are the same.\n", sep="") } else { if (R.Version()$os=="linux-gnu") stop(e) else { message(x) cat("(under linux systems they are the same.)") return(FALSE) } } } chol2mv <- function(C, n) .Call(C_chol2mv, C, as.integer(n)) tcholRHS <- function(C, RHS) { if (!is.double(RHS)) storage.mode(RHS) <- "double" .Call(C_tcholRHS, C, RHS) } colMax <- function(x) .Call(C_colMaxs, x) rowMeansx <- function(x, weight=NULL) .Call(C_rowMeansX, x, weight) rowProd <- function(x) .Call(C_rowProd, x) SelfDivByRow <- function(x, v) .Call(C_DivByRow, x, v) quadratic <- function(x, v) .Call(C_quadratic, x, v) dotXV <- function(x, w) .Call(C_dotXV, x, w) dbinorm <- function(x, S) .Call(C_dbinorm, x, S) uses.simd.instruction <- function(which=NULL, pkgs=NULL) { .Call(C_instruction_set, which, pkgs, TRUE); } misses.simd.instruction <- function(which=NULL, pkgs=NULL) { .Call(C_instruction_set, which, pkgs, FALSE); } RandomFieldsUtils/R/zzz.R0000644000176200001440000000444714227157055015044 0ustar liggesusers ## Authors ## Martin Schlather, schlather@math.uni-mannheim.de ## ## Copyright (C) 2018 -- 2021 Martin Schlather ## ## This program is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License ## as published by the Free Software Foundation; either version 3 ## of the License, or (at your option) any later version. ## ## This program is distributed in the hope that it will be useful, ## but WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ## GNU General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with this program; if not, write to the Free Software ## Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ##.C <- function(...) { cat("\ndebugging version: "); x <- ...elt(1); print(if (is.list(x)) x[1] else x); .Call(C_DebugCall); z <- .C(...); cat("\nend call: "); .Call(C_DebugCall); z} ##.Call <- function(...) { cat("\ndebugging version: "); x <- ...elt(1); print(if (is.list(x)) x[1] else x); .Call(C_DebugCall); z <- .Call(...); cat("\nend call: "); .Call(C_DebugCall); z} .onLoad <- function(lib, pkg) { .C("loadoptions") } .onAttach <- function (lib, pkg) { n <- as.integer(0.75 * parallel::detectCores(logical=FALSE) + 0.25 * parallel::detectCores(logical=TRUE)) if (!is.na(n)) .C("setCPUs", n) # or 1 if not fully installed n <- .C("recompilationNeeded", n) if (n[[1]] > 0) packageStartupMessage(paste("Consider the use of one of \n\tRFoptions(install.control=list(force=FALSE))\n\tRFoptions(install.control=list(repos=NULL))\n\tRFoptions(install.control=NULL)\nIf you use it, the package might run faster the next time it is loaded.\nRecompilation is at your own risk. 
There is currently no guarantee that it will work.\nIn case of problems:", CONTACT)) else if (n[[1]] < 0) packageStartupMessage(paste(if (n[[1]] == -1) "R" else "Auto-r", "ecompilation didn't (fully) succeed, see 'RFoptions(install.control=list(pkgs=NULL))'.\n")) } .onDetach <- function(lib) { ## do not use the following commmands in .onDetach! } .onUnload <- function(lib, pkg){ # RFoptions(storing=FALSE) ## delete everything .C("detachoptions") } RandomFieldsUtils/R/maths.R0000644000176200001440000000500614227157055015313 0ustar liggesusers ## Authors ## Martin Schlather, schlather@math.uni-mannheim.de ## ## ## Copyright (C) 2015 -- 2021 Martin Schlather ## ## This program is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License ## as published by the Free Software Foundation; either version 3 ## of the License, or (at your option) any later version. ## ## This program is distributed in the hope that it will be useful, ## but WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ## GNU General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with this program; if not, write to the Free Software ## Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. I0L0 <- function(x) { storage.mode(x) <- "double" # res <- double(length(x)) .Call(C_I0ML0, x) } .struve <- function(x, nu, sign, expon.scaled) { storage.mode(x) <- "double" storage.mode(nu) <- "double" storage.mode(expon.scaled) <- "logical" storage.mode(sign) <- "double" # res <- double(max(length(x), length(nu))) .Call(C_struve, x, nu, sign, expon.scaled) } struveH <- function(x, nu) .struve(x, nu, -1, FALSE) struveL <- function(x, nu, expon.scaled=FALSE) .struve(x, nu, 1, expon.scaled) wmscale <- function(scaling) { switch(scaling, whittle = 0.0, matern = sqrt(2), handcockwallis = 2.0 ) } whittle <- function(x, nu, derivative=0, scaling=c("whittle", "matern", "handcockwallis")) { .Call(C_WMr, as.double(x), as.double(nu), as.integer(derivative), if (is.character(scaling)) wmscale(match.arg(scaling)) else as.double(scaling)) } matern <- function(x, nu, derivative=0, scaling=c("matern", "whittle", "handcockwallis")) { whittle(x, nu, derivative, if (is.character(scaling)) wmscale(match.arg(scaling)) else scaling) } besselKx <- function(x, nu) .Call(C_besselk_simd, x, nu) nonstwm <- function(x, y, nu, log=FALSE, scaling=c("whittle", "matern", "handcockwallis")) { if (is.function(nu)) { nu1 <- nu(x) nu2 <- nu(y) } else nu1 <- nu2 <- nu L <- .Call(C_logWMr, sqrt(sum((x - y)^2)), as.double(nu1), as.double(nu2), if (is.character(scaling)) wmscale(match.arg(scaling)) else as.double(scaling)) if (log) L else exp(L) } gauss <- function(x, derivative=0) { .Call(C_gaussr, as.double(x), as.integer(derivative)) } RandomFieldsUtils/R/aaa_auto.R0000644000176200001440000000355114227157055015754 0ustar liggesusers# This file has been created automatically by 'rfGenerateConstants' ## from src/AutoRandomFieldsUtils.h auto_rfutils_h <- as.integer(1) MAXUNITS <- as.integer(4) MAXCHAR <- as.integer(18) RFOPTIONS <- "RFoptions" CLASS_TRYERROR <- "try-error" WARN_UNKNOWN_OPTION_ALL <- as.integer(4) WARN_UNKNOWN_OPTION_SINGLE <- as.integer(3) WARN_UNKNOWN_OPTION_CAPITAL <- as.integer(2) WARN_UNKNOWN_OPTION_NONE1 <- as.integer(1) WARN_UNKNOWN_OPTION_NONE <- as.integer(0) CONTACT <- " Please contact the maintainer martin.schlather@math.uni-mannheim.de.\n" ## 
from src/AutoRandomFieldsUtilsLocal.h LA_INTERN <- as.integer(0) LA_R <- as.integer(1) LA_AUTO <- as.integer(2) LA_GPU <- as.integer(3) LA_QUERY <- as.integer(4) LA_LAST <- as.integer(LA_QUERY) PIVOT_NONE <- as.integer(0) PIVOT_DO <- as.integer(1) PIVOT_AUTO <- as.integer(2) PIVOT_IDX <- as.integer(3) PIVOT_UNDEFINED <- as.integer(4) PIVOT_LAST <- as.integer(PIVOT_UNDEFINED) PIVOTSPARSE_MMD <- as.integer(1) PIVOTSPARSE_RCM <- as.integer(2) Inone <- as.integer(0) Iinstall <- as.integer(1) Iask <- as.integer(2) Isse <- as.integer(3) Isse2 <- as.integer(4) Isse3 <- as.integer(5) Issse3 <- as.integer(6) Iavx <- as.integer(7) Iavx2 <- as.integer(8) Iavx512f <- as.integer(9) Igpu <- as.integer(10) INSTALL_LAST <- as.integer(Igpu) Cholesky <- as.integer(0) SVD <- as.integer(1) Eigen <- as.integer(2) Sparse <- as.integer(3) NoInversionMethod <- as.integer(4) QR <- as.integer(5) LU <- as.integer(6) NoFurtherInversionMethod <- as.integer(7) GPUcholesky <- as.integer(8) Rcholesky <- as.integer(9) direct_formula <- as.integer(10) Diagonal <- as.integer(11) nr_InversionMethods <- as.integer((Diagonal+1)) nr_user_InversionMethods <- as.integer((NoFurtherInversionMethod+1)) LAST_R_TYPE_NAME <- as.integer(32) RandomFieldsUtils/R/gpu.R0000644000176200001440000000142014227157055014766 0ustar liggesusers gpu_info <- function(devices = NULL){ # Query function for device info # devices should be a vector of integers with the correct device number # if no device argument is given, the standard value is taken from RFoptions if (length(devices) == 0) dev <- RFoptions()$installNrun$gpuDevices else if (!is.vector(devices) || any(devices != (dev <- as.integer(devices)))) stop("Devices have to be a vector of integers") gpu_list <- .Call("gpu_info", as.integer(dev)) class(gpu_list) <- "gpu_list" return(gpu_list) } print.gpu_list <- function(x, ...){ ## pretty print function for a gpu_list instance returned by gpu_info if(!is(x, "gpu_list")) stop("Wrong argument type") df <- do.call(rbind.data.frame, x) rownames(df) <- NULL print(df) # OK } RandomFieldsUtils/R/RFoptions.R0000644000176200001440000003603014227157055016123 0ustar liggesusers ## Authors ## Martin Schlather, schlather@math.uni-mannheim.de ## ## ## Copyright (C) 2015 -- 2021 Martin Schlather ## ## This program is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License ## as published by the Free Software Foundation; either version 3 ## of the License, or (at your option) any later version. ## ## This program is distributed in the hope that it will be useful, ## but WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ## GNU General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with this program; if not, write to the Free Software ## Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. summary.RFopt <- function(object, ...) { object <- lapply(object, function(z) z[order(names(z))]) object <- object[c(1, 1 + order(names(object[-1])))] class(object) <- "summary.RFopt" object } print.summary.RFopt <- function(x, ...) { str(x, give.attr=FALSE, ...) # OK invisible(x) } print.RFopt <- function(x, ...) { print.summary.RFopt(summary.RFopt(x, ...)) invisible(x) } summary.RFoptElmnt <- function(object, ...) { object <- object[order(names(object))] class(object) <- "summary.RFoptElmt" object } print.summary.RFoptElmnt <- function(x, ...) 
{ str(x, give.attr=FALSE, ...) # OK invisible(x) } print.RFoptElmnt <- function(x, ...) { print.summary.RFoptElmnt(summary.RFoptElmnt(x, ...)) invisible(x) } detach_packages <- function(pkgs) { for (pkg in pkgs) { pkg <- paste0("package:", pkg) while(pkg %in% search()) detach(pkg, unload = TRUE, character.only=TRUE) } } libraries <- function(pkgs, control, verbose=FALSE) { if (length(control) > 0) { idx <- pmatch(names(control), names(as.list(args(library)))) control <- control[idx[!is.na(idx)]] } for (pkg in pkgs) do.call("library", c(list(pkg), control)) if (verbose) message("libraries attached.") } OneTo <- function(n) return(if (length(n) > 1) stop("invalid end of loop") else if (n < 1) NULL else 1:n) S <- function(x) if (length(x) > 1) "s" else "" ARE <- function(x) if (length(x) > 1) "are" else "is" HAVE <- function(x) if (length(x) > 1) "have" else "has" sources <- function(pkgs, raw=FALSE, repos=NULL, local.only=FALSE) { gitrepos <- "schlather/PACKAGES" gitinfo <- "https://github.com/" gitdownload <- "https://raw.githubusercontent.com/" debug <- FALSE ip <- installed.packages()[, "Version"] # OK ip <- ip[pkgs] names(ip) <- pkgs s <- if (local.only) "local" else c("local", "cran", "github") found <- matrix(FALSE, nrow=length(pkgs), ncol=length(s)) V <- where <- matrix("", nrow=length(pkgs), ncol=length(s)) dimnames(V) <- dimnames(where) <- dimnames(found) <-list(pkgs, s) for (frm in c("local0", s)) { from <- frm if (from == "local0") { from <- "local" url <- "" } else if (from == "local") url <- getwd() else { if (from == "cran") { type <- "source" if (length(repos) == 0) repos <- getOption("repos") if (debug) print(repos) ## OK cran <- NULL url <- try(contrib.url(repos=repos, type="source")) if (!is(url, "try-error")) { cran <- try(available.packages(contriburl = url)[pkgs, "Version"]) if (is(cran, "try-error") || length(cran) == 0) next } if (length(cran) == 0) next } else if (from == "github") { url <- paste0(gitinfo, gitrepos) github <- try(grep("tar.gz", fixed=TRUE, readLines(url), value = TRUE)) if (is(github, "try-error") || length(github) == 0) next } else stop("BUG") } for (i in 1:length(pkgs)) { if (from == "cran") { versions <- cran[i] ## length 1 } else { if (from == "local") { if (url == "") f <- dir(pattern=paste0(pkgs[i], "_.*\\.tar\\.gz")) else f <- dir(pattern=paste0(pkgs[i], "_.*\\.tar\\.gz"), path=url) } else { f <- grep(paste0(pkgs[i],"_"), github, value = TRUE) } if (length(f) > 0) { pkg <- paste0(pkgs[i],"_") versions <- sapply(strsplit(f, "\\.tar\\.gz"), function(x) { s <- strsplit(x[1], pkg)[[1]] s[length(s)] }) } else versions <- NULL } old.version <- ip[i] where[i, from] <- url for (j in OneTo(length(versions))) { cmp <- compareVersion(versions[j], ip[i]) if (cmp >= 0) { found[i, from] <- TRUE if (compareVersion(versions[j], old.version)) { old.version <- versions[j] V[i, from] <- versions[j] } } } } if (frm == "local") { ## NOT 'from ' if (all(anyfound <- apply(found, 1, any))) break; ## all found locally } } if (debug) Print(list(where=where, found=found, newer.version=V, ip=ip)) ## OK if (raw) return(list(where=where, found=found, newer.version=V, ip=ip)) failed <- !apply(found, 1, any) if (any(failed)) { if (all(failed)) return(list(what=NULL, failed=failed)) where <- where[!failed, , drop=FALSE] found <- found[!failed, , drop=FALSE] V <- V[!failed, , drop=FALSE] ip <- ip[!failed] pkgs <- pkgs[!failed] } what <- matrix("", nrow=length(ip), ncol=4) dimnames(what) <- list(names(ip), c("how", "where", "version", "call")) method <- colnames(V) if 
(all(apply(V == "", 1, any, na.rm=TRUE))) {## take ## all current iff all current are available. This is the safest. found[V != ""] <- FALSE dim(found) <- dim(where) } else if (all(what[, "cran"] != "")) found[, method != "cran"] <- FALSE ## take ## cran versions if all cran vesions are available; second safest since this necessitates ## that R version is recent enough ## Otherwise try the best, i.e. take always the most recent ones -- this reduced ## probability of version incompatibilities for (i in 1:length(ip)) { if (length(f <- which(found[i,])) == 0) next newest <- f[1] for (j in f[-1]) if (compareVersion(V[j], V[newest]) > 0) newest <- j what[i, 1:3] <- c(method[newest], where[i, newest], if (V[i, newest] == "") ip[i] else V[i, newest]) } idx <- what[, "how"] == "local" path <- what[idx, "where"] add <- path != "" & substring(path, nchar(path)) != .Platform$file.sep path[add] <- paste0(path, .Platform$file.sep) what[idx, "call"] <- paste0(path, pkgs[idx], "_", what[idx, "version"], ".tar.gz") idx <- what[, "how"] == "github" what[idx, "call"] <- paste0(gitdownload, gitrepos, "/main/", pkgs[idx], "_", what[idx, "version"], ".tar.gz") idx <- what[, "how"] == "cran" what[idx, "call"] <- pkgs[idx] if (debug) Print(t(what), failed) ## OK return(list(what=what, failed=failed)) } # pkgs <- c("RandomFieldsUtils", "miraculix", "RandomFields");print("XX"); print(s <- sources(pkgs)); tmp <- apply(found, 1, any) # https://raw.githubusercontent.com/schlather/PACKAGES/main/miraculix_1.0.2.tar.gz reinstallPackages <- function(ic, installNrun, install.control) { install <- installNrun$install mem_is_aligned <- installNrun$mem_is_aligned if (is.na(mem_is_aligned)) mem_is_aligned <- TRUE ## Print(installNrun) verbose <- FALSE force <- quiet <- CROSS <- pkgs.given <- path.given <- local.only <- FALSE repos <- path <- pkgs <- NULL if (ic) { N <- names(install.control) if ("pkg" %in% N) stop("'pkg' is an invalid option for 'install.control'. Did you mean 'pkgs'?") pkgs.given <- "pkgs" %in% N path.given <- "path" %in% N path <- install.control$path delete <- c("repos", "path", "force", "pkgs", "CROSS") for (arg in c(delete, "verbose", "quiet")) if (length(install.control[[arg]]) > 0) { assign(arg, install.control[[arg]]) if (arg %in% delete) install.control[[arg]] <- NULL } if (length(install.control$force) > 0 && !force) install <- "ask" else if (length(install) > 0 && install %in% c("ask", "no installation")) install <- "install" if ("MEM_IS_ALIGNED" %in% N) { mem_is_aligned <- install.control$MEM_IS_ALIGNED force <- TRUE } if ("LOCAL_ONLY" %in% N) local.only <- install.control$LOCAL_ONLY } if (!pkgs.given) pkgs <- .Call(C_getPackagesToBeInstalled, force) verbose <- verbose && !quiet if (length(pkgs) == 0) { .Call(C_SIMDmessages, "all") cat("See ?RFoptions for options.\n") if (!quiet) message(if (!pkgs.given) "No packages found to be installed.", if (!path.given && !pkgs.given) " Consider setting, in 'install.control', a path to a local directory.", if (verbose) " This happens particularly if the the installation process was interrupted. Try it again in the next session or use 'RFoptions(install.control=list(force=TRUE))' for instance.") return() } if (install == "ask") { if (!quiet) cat("The package", S(pkgs), " ", paste0("'", pkgs, "'", collapse=", "), " ", HAVE(pkgs), " been compiled without appropriate SIMD/AVX2 flags. So, calculations can be slow. 
If the package", S(pkgs), " ", ARE(pkgs), " recompiled with the necessary flags, the calculations might be faster.\nR should be restarted after re-compiling. The argument 'install.control' might be used to run the re-compilation without asking and to pass further arguments to 'install.packages', e.g., 'RFoptions(install.control=list(verbose=TRUE))'\nTo avoid this feedback, set 'RFoptions(install=\"no\")' or 'RFoptions(install=\"install\")' before calling any other function of '", pkgs[length(pkgs)],"'.\n\n", sep="") omp <- .Call(C_SIMDmessages, pkgs) } ## pkgs <- c("RandomFieldsUtils", "miraculix", "RandomFields");print("XX") if (!quiet) cat("Searching for tar balls... ") s <- sources(pkgs,repos=repos, local.only=local.only) cat("\n") if (all(s$failed)) { if (!quiet) cat("Not a single source found for re-installation.\n") return() } tell.which <- function(s, verbose) { cat("The following package", S(!s$failed), " will be re-installed:\n", sep="", paste0(if (!verbose) "\t", rownames(s$what), "_", s$what[, "version"], " from ", s$what[, "how"], if (verbose) ", ", if (verbose) s$what[, "where"], "\n") ) if (any(s$failed)) { cat("No recent tar ball found for ", paste0("'", names(s$failed)[s$failed], "'", collapse=", ", sep=""), ". ", sep="") if (verbose) cat("Consider calling\n\t'RFoptions(install.control=list(path=\"\",\n\t\t\tverbose=TRUE))'") cat("\n") } } ## tell.which(s, verbose) neon <- .Call(C_isNEONavailable) arm32 <- !is.na(neon) x86_64 <- .Call(C_isX86_64) CROSS_DEFAULT <- if (arm32) "arm32" else if (x86_64) "avx" else "FALSE" if ((asked = install == "ask")) { if (!quiet) tell.which(s, verbose) repeat { txt <- paste0("Shall '", rownames(s$what)[1], "' and all further packages based on 'RandomFieldsUtils' be recompiled (Y/n/h/s)erver/) ? ") install.control <- readline(txt) if (install.control %in% c("h", "H")) { cat("\nHelp info (see ?RFoptions Details..InstallNrun..install for details)\n ====================================================\n") cat("Y : installation \n") cat("n : interruption.\n") cat("s : CROSS=\"", CROSS_DEFAULT, "\".\n") cat(": arguments for 'install.packages',\n e.g. 'lib = \"~\", quite=TRUE'\n") cat("\n") } else break } install <- if (install.control %in% c("n", "N")) "no installation" else "install" path <- NULL if (install.control %in% c("s", "S")) CROSS <- CROSS_DEFAULT if (nchar(install.control) <= 3) install.control <-"" if (verbose) { if (install == "no installation") .Call(C_SIMDmessages, NULL) else { S <- "\t*************************************************\n" cat("\n", S, "\t*** Do not forget to restart R. 
***\n",S) sleep.milli(1500) } } } else { omp <- .Call(C_SIMDmessages, "OMP") if (!quiet) tell.which(s, verbose) } if (install != "no installation") { if (is.character(install.control)) install.control <- eval(parse(text=paste("list(", install.control, ")"))) SIMD_FLAGS <- CXX_FLAGS <- args <- "" if (length(install.control$configue.args) > 0) { args <- install.control$configue.args install.control$configue.args <- NULL } if (length(install.control$CXX_FLAGS) > 0) { CXX_FLAGS <- install.control$CXX_FLAGS install.control$CXX_FLAGS <- omp <- NULL } if (length(install.control$SIMD_FLAGS) > 0) { SIMD_FLAGS <- install.control$SIMD_FLAGS install.control$SIMD_FLAGS <- NULL } if (length(install.control$USE_GPU) > 0) { usegpu <- if (install.control$USE_GPU) " USE_GPU=TRUE" else "" install.control$USE_GPU <- NULL } else usegpu <- if (.Call(C_isGPUavailable)) " USE_GPU=try" else "" #Print(.Call(C_isGPUavailable)) idx <- pmatch(names(install.control),names(as.list(args(install.packages)))) install.control <- install.control[which(!is.na(idx))] args <- paste0(args, usegpu, " USERASKED=", asked, " CROSS=", CROSS, " MEM_IS_ALIGNED=", mem_is_aligned, if (length(SIMD_FLAGS) > 0) paste0(" SIMD_FLAGS='", SIMD_FLAGS, "'"), if (length(CXX_FLAGS) + length(omp) > 0) paste0(" CXX_FLAGS='", CXX_FLAGS, " ", omp, "'") ) if (verbose) Print(install.control, args) ## OK how <- s$what[, "how"] pkgs <- s$what[, "call"] for (p in 1:nrow(s$what)) { z <- Try(do.call("install.packages", c(list(pkgs=pkgs[p], type="source", repos = if (how[p] == "cran") s$what[p, "where"] else NULL), install.control, configure.args=args))) if (is(z, "try-error")) print(z) ## OK } ## on.exit({detach_packages(rev(pkgs)); libraries(pkgs)}, add=TRUE) } cat("\n\n") } RFoptions <- function(..., no.class=FALSE, install.control=NULL) { opt <- .External(C_RFoptions, ...) ## if (is.list(opt)) Print(installNrun) else Print(opt) ic <- hasArg("install.control") ## print(opt) ## print(ic) if (ic || (length(opt) > 0 && is.list(opt) && is.list(opt$installNrun) && opt$installNrun$installPackages && interactive())) { reinstallPackages(ic=ic, installNrun=opt$installNrun, install.control=install.control) if (ic) return(invisible(NULL)) } if (length(opt) == 0 || no.class) return(invisible(opt)) if (is.list(opt[[1]])) { opt <- lapply(opt, function(x) { class(x) <- "RFoptElmnt" x }) class(opt) <- "RFopt" } else class(opt) <- "RFoptElmnt" opt } RandomFieldsUtils/R/internal_use.R0000644000176200001440000002702514227157055016674 0ustar liggesusers ## Authors ## Martin Schlather, schlather@math.uni-mannheim.de ## ## ## Copyright (C) 2015 -- 2021 Martin Schlather ## ## This program is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License ## as published by the Free Software Foundation; either version 3 ## of the License, or (at your option) any later version. ## ## This program is distributed in the hope that it will be useful, ## but WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ## GNU General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with this program; if not, write to the Free Software ## Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
debugging_level <- function() .Call(C_debuggingLevel) Try <- function(expr) { z <- tryCatch(expr, error = function(e) e) if (is.list(z) && !is.null(z$message) && !is.null(z$call)) class(z) <- CLASS_TRYERROR z } checkExamples <- function(exclude=NULL, include=1:length(.fct.list), ask=FALSE, echo=TRUE, halt=FALSE, ignore.all=FALSE, path=package, package="RandomFields", read.rd.files=TRUE, local = FALSE, libpath = NULL, single.runs = FALSE, reset, catcherror=TRUE) { ## print("A") .exclude <- exclude .ask <- ask .echo <- echo .halt <- halt .ignore.all <- ignore.all .package <- package .path <- path .local <- local ## useDynLib <- importClassesFrom <- import <- ## importFrom <- exportClasses <- ## importMethodsFrom <- exportMethods <- S3method <- ## function(...) NULL .env <- new.env() stopifnot(is.na(RFoptions()$basic$seed)) # OK exportPattern <- function(p) { ## necessary to read NAMESPACE??!! if (p == "^R\\.") p <- "^R." all.pattern <- p %in% c("^[^\\.]", "^[^.]", ".") | get("all.pattern", .env) if (!.ignore.all) assign("all.pattern", all.pattern, .env) if (all.pattern) return(NULL) stopifnot(nchar(p)==3, substr(p,1,1)=="^") ## OK assign("p", c(get("p", .env), substring(p, 2)), .env) } export <- function(...) { ## code from 'rm' dots <- match.call(expand.dots = FALSE)$... z <-deparse(substitute(...)) if (length(dots) && !all(sapply(dots, function(x) is.symbol(x) || is.character(x)))) stop("... must contain names or character strings") z <- sapply(dots, as.character) cat("export", z, "\n") assign("export", c(get("export", .env), z), .env) } import <- importClassesFrom <- importMethodsFrom <- importFrom <- useDynLib <- exportClasses <- S3method <- exportMethods <- function(...) { dots <- match.call(expand.dots = FALSE)$... # cat("other:", sapply(dots, as.character), "\n") } assign("export", NULL, .env) assign("all.pattern", FALSE, .env) assign("p", NULL, .env) ## cat("'source' causes problems in valgrind") .content <- readLines(paste(.path, "NAMESPACE", sep="/"), -1) eval(parse(text = .content)) ## cat("\tend source\n") if (is.logical(read.rd.files) && !read.rd.files) { .package.env <- parent.env(.GlobalEnv) while (attr(.package.env, "name") != paste("package:", .package, sep="")) { .package.env <- parent.env(.package.env) } .orig.fct.list <- ls(envir=.package.env) .ok <- (get("all.pattern", .env) | substr(.orig.fct.list, 1, 1) %in% get("p", .env) | .orig.fct.list %in% get("export", .env)) .fct.list <- .orig.fct.list[.ok] } else { if (is.logical(read.rd.files)) .path <- paste("./", .path, "/man", sep="") else .path <- read.rd.files .files <- dir(.path, pattern="d$") .fct.list <- character(length(.files)) for (i in 1:length(.files)) { #cat(i, .path, .files[i], "\n") #if (i == 152) {cat("jumped\n"); next} #Print(.path, .files[i]) #.content <- scan(paste(.path, .files[i], sep="/") , what=character(), # quiet=TRUE) .fn <- paste(.path, .files[i], sep="/") .content <- readLines(.fn, n = 2) if (substr(.content[1], 1, 5) != "\\name" && (substr(.content[1], 1, 1) != "%" || substr(.content[2], 1, 5) != "\\name")) stop(.files[i], " does not start with '\\name' -- what at least in 2018 has caused problems in valgrind") .content <- scan(.fn, what=character(), quiet=TRUE) .content <- strsplit(.content, "alias\\{") .content <- .content[which(lapply(.content, length) > 1)][[1]][2] .fct.list[i] <- strsplit(strsplit(.content,"\\}")[[1]][1], ",")[[1]][1] } } .include <- if (is.numeric(include)) include else 1:99999 .include.name <- include if (exists("RMexp")) RFoptions(graphics.close_screen = TRUE, 
graphics.split_screen = TRUE) .RFopt <- RFoptions() .not_working_no <- .not_working <- NULL .included.fl <- .fct.list[.include] .not.isna <- !is.na(.included.fl) .include <- .include[.not.isna] .included.fl <- .included.fl[.not.isna] .max.fct.list <- max(.included.fl) if (single.runs) { file.in <- "example..R" file.out <- "example..Rout" if (file.exists(file.out)) file.remove(file.out) } if (is.character(.include.name)) { .include.name <- sapply(strsplit(.include.name, ".Rd"), function(x) x[1]) } .allwarnings <- list() .tryerror <- paste0("\"try-", "error\""); for (.idx in .include) { ## Print(.idx) if (is.character(.include.name) && !(.fct.list[.idx] %in% .include.name)) next tryCatch(repeat dev.off(), error = function(e) e) if (.idx %in% .exclude) next cat("\n\n\n\n\n", .idx, " ", .package, ":", .fct.list[.idx], " (total=", length(.fct.list), ") \n", sep="") RFoptions(list_=.RFopt) if (!missing(reset)) do.call(reset) if (.echo) cat(.idx, "") .tryok <- TRUE if (single.runs) { txt <- paste("library(", package,", ", libpath, "); example(", .fct.list[.idx], ", ask =", .ask, ", echo =", .echo, ")", sep="") write(file=file.in, txt) command <- paste("R < ", file.in, ">>", file.out) } else { ##s topifnot(RFoptions()$basic$print <=2) ## Print(.fct.list[.idx], package) if (catcherror) .time <- system.time(.res <- try(do.call(utils::example, ## OK list(.fct.list[.idx], ask=.ask, package=package, echo=.echo, local=.local)))) else .time <- system.time(.res <- do.call(utils::example, list(.fct.list[.idx], ask=.ask, package=package, echo=.echo, local=.local))) w <- warnings() .allwarnings <- c(.allwarnings, list(c("Help page ", .idx)), w) if (length(w) > 0) print(w) ## OK if (is(.res, CLASS_TRYERROR) || is(.res, .tryerror) || is(.res, "SimpleError") || is(.res, "error")) { cat("ERROR:\n") str(.res, give.head=FALSE) # OK if (.halt) { stop("\n\n\t***** ",.fct.list[.idx], " (", .idx, " out of ", max(.include), "). has failed. *****\n\n") } else { .not_working_no <- c(.not_working_no, .idx) .not_working <- c(.not_working, .fct.list[.idx]) .tryok <- FALSE } } if (exists("RMexp")) RFoptions(storing = FALSE) cat("****** '", .fct.list[.idx], "' (", .idx, ") done. ******\n") print(.time) # OK if (.tryok && !is.na(RFoptions()$basic$seed)) { Print(.not_working, paste(.not_working_no, collapse=", "), # OK RFoptions()$basic$seed) stop("seed not NA: ", .fct.list[.idx]) } } } Print(.not_working, paste(.not_working_no, collapse=", ")) # OK .ret <- list(.not_working, .not_working_no) names(.ret) <- c(.package, "") return(.ret) } reverse_dependencies_with_maintainers <- function(packages, which = c("Depends", "Imports", "LinkingTo"), recursive = FALSE) { ## function taken from CRAN developer website. 
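  ## Looks up, in the CRAN package data base, all packages that reverse
  ## depend on 'packages' and returns their name, version and maintainer;
  ## used by Dependencies() below.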
repos <- getOption("repos")["CRAN"] ## if (substr(repos, 1, 1) == "@") repos <- "http://cran.r-project.org" Print(repos) # OK contrib.url(repos, "source") # trigger chooseCRANmirror() if required description <- sprintf("%s/web/packages/packages.rds", repos) con <- if(substring(description, 1L, 7L) == "file://") file(description, "rb") else url(description, "rb") on.exit(close(con)) db <- readRDS(gzcon(con)) rownames(db) <- NULL rdepends <- tools::package_dependencies(packages, db, which, recursive = recursive, reverse = TRUE) rdepends <- sort(unique(unlist(rdepends))) pos <- match(rdepends, db[, "Package"], nomatch = 0L) db <- db[pos, c("Package", "Version", "Maintainer")] if (is.vector(db)) dim(db) <- c(1, length(db)) db } ShowInstallErrors <- function(dir=".", pkgs=unlist(strsplit( dir(pattern="*.Rcheck"), ".Rcheck"))) for (i in 1:length(pkgs)) { cat("\n\n", pkgs[i], "\n") for (f in c("00install.out", "00check.log")) { system(paste("grep [eE][rR][rR][oO][rR] ", dir, "/", pkgs[i], ".Rcheck/", f, sep="")) system(paste("grep \"user system elapsed\" -A 2 ", dir, "/", pkgs[i], ".Rcheck/", f, sep="")) ## system(paste("grep \"Warning messages\" -A 4 ", dir, "/", pkgs[i], ## ".Rcheck/", f, sep="")) ### find -type f -name "00*" -exec grep Warning {} \; -print ### find -type f -name "00*" -exec grep "user system elapse" -A 3 {} \; -print } } ## library(RandomFields); Dependencies(package="RandomFields", install=TRUE, reverse=FALSE) Dependencies <- function(pkgs = all.pkgs, dir = "Dependencies", install = FALSE, check=TRUE, reverse=FALSE, package="RandomFields") { Print(utils::packageDescription(package)) # OK all <- reverse_dependencies_with_maintainers(package #, which="Suggests") , which="all") all.pkgs <- all[, 1] PKGS <- paste(all[,1], "_", all[,2], ".tar.gz", sep="") ## getOption("repos")["CRAN"] URL <- "http://cran.r-project.org/src/contrib/" if (install) { system(paste("mkdir ", dir)) system(paste("rm ", dir, "/*tar.gz*", sep="")) for (i in 1:length(pkgs)) { cat("PACKAGE:", PKGS[i], ":", i, "out of ", length(pkgs),"\n") x <- system(paste("(cd ", dir, "; wget ", URL, PKGS[i], ")", sep="")) if (x != 0) stop(PKGS[i], "not downloadable") ## extended version see RandomFields V 3.0.51 or earlier } } if (!hasArg("pkgs")) { if (check) { reverse <- if (reverse) list(repos = getOption("repos")["CRAN"]) else NULL tools::check_packages_in_dir(dir=dir, check_args = c("--as-cran", ""), reverse=reverse) } ShowInstallErrors(dir, pkgs) return(NULL) } else { ### old: if (check) { for (j in 1:length(pkgs)) { i <- pmatch(pkgs[j], PKGS) if (is.na(i)) next command <- paste("(cd ", dir, "; time R CMD check --as-cran", PKGS[i],")") Print(command) # OK x <- system(command) ShowInstallErrors(dir, pkgs) if (x != 0) stop(PKGS[i], "failed") } } } } # R Under development (unstable) (2014-12-09 r67142) -- "Unsuffered Consequences" # Dependencies(check=FALSE) RandomFieldsUtils/MD50000644000176200001440000001240714227516720014164 0ustar liggesusers4d830fef828db1907873e029de09d19a *DESCRIPTION f2087647a42dec259a0a09d861fb8021 *NAMESPACE dee5a5aa6478973e24fbfec3a64fbae6 *R/RFoptions.R 2589122b024aec1c487bd2b8994d0ea6 *R/aaa_auto.R 338ea990737140ea242f076828bec7d3 *R/gpu.R 8c720cf5ac10a3d3cee811fa9cf3e83b *R/internal_use.R 393b4758f869015663b50a6be05ed7a3 *R/maths.R b17d1432c0ddba6dc3e116837f8bf3fa *R/utils.R 5434648b54cfdd5e6d1867d16b97c6e5 *R/zzz.R 52eac060dd38a477e4130553a02bc70c *cleanup 9519f65bb1de406eeb0ad4760e9a6bf5 *configure 4133f6bba97fad29e54cb1d89cf58e66 *configure.ac fb39a615a88c13b1e6c1537d5194d27b 
*inst/CITATION 8658910d11a4015a0441aaeac8942a0d *inst/include/AutoRandomFieldsUtils.h afb81e101195cc0392313231a3c0a31e *inst/include/AutoRandomFieldsUtilsLocal.h 3b70c8fb2c89d78aa8763d1761418775 *inst/include/Basic_utils.h 605c12ae0197f0c16b4ce1ac8540ef2a *inst/include/Basic_utils_local.h 77bf752ebf581380775d9e3a8f5d6ea0 *inst/include/General_utils.h 8fa0460f4a37098acf60741656a8b5be *inst/include/RFU.h 3c2c33085a75ba41c8b8f57c2a32d137 *inst/include/RandomFieldsUtils.h 884929554f7a57217468d6e9009662b3 *inst/include/Utils.h ae8407e4ef96c83c2185a7cfd9177ecc *inst/include/def.h cdb1f5988b7b8ceefcc90d38a5beff15 *inst/include/errors_messages.h 401e9c5bbf057dde0181b0c93cbddac9 *inst/include/extern.h 33160af017d360878298a5ce97d4bd26 *inst/include/intrinsics.h 07d984831409907609ec3020bb864e42 *inst/include/kleinkram.h 7460900ab4ff320a19ae16a323b29237 *inst/include/options.h 5dddf12c5fbc921396629afd0ba4d27b *inst/include/parallel_base.h 5605064df51fb3e41801047e27f4c38a *inst/include/parallel_simd.h 69eeccffb239d6040e7de658ef5109ff *inst/include/solve_gpu.h 16264416ab52a13090a5fef5457ce860 *inst/include/sse2neon.h 06a725126e9ab85f3985a65b42d8a442 *inst/include/win_linux_aux.h 1820d683e317f6aaf10a889026a00305 *inst/include/xport_import.h 6f17c5d3f97daa734f7e7e15c7f75817 *inst/include/zzz_RandomFieldsUtils.h 85b0d63f4b6367bc65e016bc49b2b66d *inst/include/zzz_calls.h 608853e9e8c6e930bc7bd368dbb1c6c4 *man/Print.Rd 6b6ba658a4feaca91ca28607a6aeaeec *man/RFoptions.Rd 68ff48999850fece688970a050833cde *man/Struve.Rd 250312a5b67267e044adce7e70999a17 *man/cholPosDef.Rd b2b145f200d3f632242cd0c6de5ff30c *man/confirm.Rd 5fa2c983fd6d5dd23a9768f7acca83ac *man/dbinorm.Rd 8b886b6166e087e0569ffd15955a0522 *man/fileexists.Rd f3110a882c44dfef105d993bc0bcf1c1 *man/gauss.Rd 7f46ac2b9ab8d2e72d75abe74ba088a9 *man/hostname.Rd 090c6219b75a9c02efb5779830355a4a *man/internal.Rd ea625c52484ff908a3b7326fc1740c0d *man/macros/allg_defn.Rd 49a50d2055b191c46bc8c1f745cf7f22 *man/matern.Rd 96919b3be884ffbcfff904c1becd4de6 *man/nonstwm.Rd d1aae9f8bcf5a0d2ffd370c2db7048ad *man/orderx.Rd a6c6d4db5a795c5d9b6f856f7c6c3b11 *man/rowMeansx.Rd b2cfc542e75708582e9f607f50ddb88a *man/sleep.Rd 6d7463915d90bd7ab28d9b4ae3f7c1e8 *man/solvePosDef.Rd 7034e7a90104522c8da4f95a536472de *man/sortx.Rd 17fbea7d9c509a41968ccc35fbaaaca5 *man/uses.instruction.set.Rd 7d6e672314edc987f8531fa9068f991b *src/AutoRandomFieldsUtils.cc 8658910d11a4015a0441aaeac8942a0d *src/AutoRandomFieldsUtils.h afb81e101195cc0392313231a3c0a31e *src/AutoRandomFieldsUtilsLocal.h 3b70c8fb2c89d78aa8763d1761418775 *src/Basic_utils.h 605c12ae0197f0c16b4ce1ac8540ef2a *src/Basic_utils_local.h 77bf752ebf581380775d9e3a8f5d6ea0 *src/General_utils.h ed4c603a3e78f3b42d1064af17cc62d6 *src/Makevars.in 08ecd29d8b449c057b557ed09b9eaba3 *src/Makevars.win 8fa0460f4a37098acf60741656a8b5be *src/RFU.h 986d21c9fa1e4ba54655031332ff8078 *src/RFoptions.cc 3c2c33085a75ba41c8b8f57c2a32d137 *src/RandomFieldsUtils.h 884929554f7a57217468d6e9009662b3 *src/Utils.h d7fe2390781151860f592e97e8b98656 *src/avx2_fctns.cc 136dc23041b2bea81ab75aeb8bd2151a *src/avx_fctns.cc 9403b6ee8cc2a8043458e693a56a53fc *src/bckslvmodified.f 742c238402c47696cd114884209beaec *src/beskf.cc efb1d5113504617e6f4cbad4840e66b2 *src/brdomain.cc a1419ec40f106a5f2f250490df94d58c *src/cholmodified.f ae8407e4ef96c83c2185a7cfd9177ecc *src/def.h cdb1f5988b7b8ceefcc90d38a5beff15 *src/errors_messages.h 0a4d203ebe796e14f9739faa0699e395 *src/extern.cc 401e9c5bbf057dde0181b0c93cbddac9 *src/extern.h d4c71dac26ad8599cd34a1390693972d *src/gpu_info.cc 
2a0d4f0f52b415bb8c315e6bcb0c462b *src/gpu_info_61.cu 33160af017d360878298a5ce97d4bd26 *src/intrinsics.h 0f8ade3ef315fd1db6a8ce5750ce0e4b *src/kleinkram.cc 07d984831409907609ec3020bb864e42 *src/kleinkram.h 249080b5560ec4b9c756b67afb9725f8 *src/maths.cc 89876301987287494375d3be189707b3 *src/obsolete.cc ae559015f2d35b656164308381c1058c *src/options.cc 7460900ab4ff320a19ae16a323b29237 *src/options.h 5dddf12c5fbc921396629afd0ba4d27b *src/parallel_base.h 5605064df51fb3e41801047e27f4c38a *src/parallel_simd.h 4ec51bdb2d2382372d6e57a89de7e3a3 *src/solve.cc 230e08a45ae40c411620c1cd78188f6b *src/solve_61.cu 69eeccffb239d6040e7de658ef5109ff *src/solve_gpu.h d9168a5620e0a871effa69d7192bdea6 *src/sort.cc a575ce6abf2a8c4e466c9ace3b4b6f15 *src/sortLong.cc 5b9f10f26148d88f9bd7c31fff83b376 *src/spamown.f 16264416ab52a13090a5fef5457ce860 *src/sse2neon.h 43eebb8901606ccee6273a86ef195e5a *src/utils.cc 64f80517264fe4022be652ce77618fea *src/win_linux_aux.cc 06a725126e9ab85f3985a65b42d8a442 *src/win_linux_aux.h a52b9349cad05b3d1ebade5f4a528032 *src/xport_import.cc 1820d683e317f6aaf10a889026a00305 *src/xport_import.h 10e10835c9b19d4a76ab6cd6d4be26d6 *src/zzz.c 6f17c5d3f97daa734f7e7e15c7f75817 *src/zzz_RandomFieldsUtils.h 85b0d63f4b6367bc65e016bc49b2b66d *src/zzz_calls.h RandomFieldsUtils/inst/0000755000176200001440000000000014227157056014630 5ustar liggesusersRandomFieldsUtils/inst/include/0000755000176200001440000000000014227157055016252 5ustar liggesusersRandomFieldsUtils/inst/include/General_utils.h0000644000176200001440000000174214227157055021224 0ustar liggesusers /* Authors Martin Schlather, schlather@math.uni-mannheim.de Copyright (C) 2015 -- 2021 Martin Schlather This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #ifndef rfutils_general_H #define rfutils_general_H 1 #if defined OBSOLETE_RFU #include "RFU.h" #include "kleinkram.h" #else #include "errors_messages.h" #include "options.h" #endif #endif RandomFieldsUtils/inst/include/RFU.h0000644000176200001440000000253014227157055017057 0ustar liggesusers#ifndef RFU_rfutils_h #define RFU_rfutils_h 1 #include "errors_messages.h" //#define SCALAR_RU_H 1 #define SCALAR_BASE 0 #define SCALAR_AVX 1 #define SCALAR_NEARFMA 6 // never change number, see haplogeno.R !! 
#define SCALAR_KAHAN 8 #define SOLVE 0 #define MATRIXSQRT 1 #define DETERMINANT 2 #define SOLVE_METHODS 3 typedef // benoetigt struct solve_storage { errorstring_type err_msg; InversionMethod method, newMethods[SOLVE_METHODS]; usr_bool sparse; int size, actual_size, actual_pivot; int nsuper; Long n_main, n_rhs, n_w2, n_U, n_D, n_w3, n_lnz, n_result; // SICH, n_MM, n_VT, n_ work, n_ nnzlindx, int *pivot_idx, n_pivot_idx, *iwork, n_iwork, //eigen, svd, LU, spam *pivotsparse, n_pivotsparse, *xlnz, n_xlnz, //spam *snode, n_snode, *xsuper, n_xsuper, *invp, n_invp, // spam *cols, n_cols, *rows, n_rows, *lindx, n_lindx, // spam *xja, n_xja; // chol, eigen, spam double *main, *rhs,// diagonal, general -- FORBIDDEN for further use *w2, // eigen, svd, LU, QR, pivot *U, // eigen, svd, pivot *D, // eigen, svd, cholesky, spam, pivot *w3, // spam, QR, svd, eigen *lnz, // spam, svd *result, // sqrtPosDefFree *to_be_deleted; } solve_storage; #define LINEAR_BASE 0 #define LINEAR_AVX 1 void linearX(double *x, double y, Long len, double *out, Long n); #endif RandomFieldsUtils/inst/include/def.h0000644000176200001440000000207614227157055017166 0ustar liggesusers /* Authors Martin Schlather, schlather@math.uni-mannheim.de Copyright (C) 2015 -- 2021 Martin Schlather This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ /// sysconf (_SC_NPROCESSORS_ONLN) // number of cores available // int get_nprocs (void) // dito #ifndef RFUdef_H #define RFUdef_H 1 //// 1 //// 1 //// 1 #if ! defined SCHLATHERS_MACHINE && defined SCHLATHER_DEBUGGING #undef SCHLATHER_DEBUGGING #else //// 1 #endif // // 1 #endif RandomFieldsUtils/inst/include/RandomFieldsUtils.h0000644000176200001440000000434414227157055022020 0ustar liggesusers /* Authors Martin Schlather, schlather@math.uni-mannheim.de Copyright (C) 2015 -- 2021 Martin Schlather This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ #ifndef RFutils_public_H #define RFutils_public_H 1 #ifdef __cplusplus extern "C" { #endif SEXP scalarR(SEXP x, SEXP y, SEXP mode); SEXP struve(SEXP X, SEXP Nu, SEXP Factor_Sign, SEXP Expscaled); SEXP besselk_simd(SEXP X, SEXP Nu); SEXP I0ML0(SEXP X); SEXP gaussr(SEXP X, SEXP Derivative); SEXP WMr(SEXP X, SEXP Nu, SEXP Derivative, SEXP Factor); SEXP logWMr(SEXP X, SEXP Nu1, SEXP Nu2, SEXP Factor); SEXP SolvePosDefR(SEXP M, SEXP rhs, SEXP logdet); SEXP Chol(SEXP M); SEXP RFoptions(SEXP options); void loadoptions(); void detachoptions(); SEXP sortX(SEXP Data, SEXP From, SEXP To, SEXP NAlast); SEXP orderX(SEXP Data, SEXP From, SEXP To, SEXP NAlast); SEXP colMaxs(SEXP M); SEXP rowMeansX(SEXP M, SEXP Factor); SEXP rowProd(SEXP M); SEXP chol2mv(SEXP Chol, SEXP N); SEXP tcholRHS(SEXP C, SEXP RHS); SEXP DivByRow(SEXP M, SEXP V); SEXP quadratic(SEXP x, SEXP A); SEXP dbinorm(SEXP X, SEXP Sigma); SEXP dotXV(SEXP M, SEXP V); // void Ordering(double *d, int *len, int *dim, int *pos); SEXP crossprodX(SEXP X, SEXP Y, SEXP mode); SEXP DebugCall(); SEXP getPackagesToBeInstalled(SEXP Force); SEXP isGPUavailable(); SEXP isNEONavailable(); SEXP isX86_64(); void setCPUs(int *n); void recompilationNeeded(int *n); SEXP SIMDmessages(SEXP pkgs); SEXP debuggingLevel(); SEXP gpu_info(SEXP DEVICES); SEXP instruction_set(SEXP which, SEXP pkgs, SEXP used); #ifdef __cplusplus } #endif #endif RandomFieldsUtils/inst/include/errors_messages.h0000644000176200001440000001415014227157055021627 0ustar liggesusers /* Authors Martin Schlather, schlather@math.uni-mannheim.de Copyright (C) 2015 -- 2021 Martin Schlather This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ // Datei wi #ifndef rfutils_error_H #define rfutils_error_H 1 #define NOERROR 0 #define ERRORMEMORYALLOCATION 1 #define ERRORFAILED 2 /* method didn't work for the specified parameters */ #define ERRORNOTPROGRAMMEDYET 3 #define ERRORM 4 /* a single error message */ #define ERRORMEND 12 /* a single error message -- und alles dazwischen */ #ifdef SCHLATHERS_MACHINE #define ERRLINE PRINTF("(ERROR in %s, line %d)\n", __FILE__, __LINE__) #else #define ERRLINE #endif #ifndef ERR #define ERR ERR0 #endif #define ERR0(X) {ERRLINE; RFERROR(X);} #define ERR00(X) ERRLINE; errorstring_type E_AUX; #define ERR1(X,Y) {ERR00(X);SPRINTF(E_AUX,X,Y); RFERROR(E_AUX);} #define ERR2(X,Y,Z) {ERR00(X);SPRINTF(E_AUX,X,Y,Z); RFERROR(E_AUX);} #define ERR3(X,Y,Z,A) {ERR00(X);SPRINTF(E_AUX,X,Y,Z,A); RFERROR(E_AUX);} #define ERR4(X,Y,Z,A,B) {ERR00(X);SPRINTF(E_AUX,X,Y,Z,A,B); RFERROR(E_AUX);} #define ERR5(X,Y,Z,A,B,C) {ERR00(X);SPRINTF(E_AUX,X,Y,Z,A,B,C); RFERROR(E_AUX);} #define ERR6(X,Y,Z,A,B,C,D) {ERR00(X);SPRINTF(E_AUX,X,Y,Z,A,B,C,D); RFERROR(E_AUX);} #define ERR7(X,Y,Z,A,B,C,D,E) {ERR00(X);SPRINTF(E_AUX,X,Y,Z,A,B,C,D,E); RFERROR(E_AUX);} #define ERR8(X,Y,Z,A,B,C,D,E,F) {ERR00(X);SPRINTF(E_AUX,X,Y,Z,A,B,C,D,E,F); RFERROR(E_AUX);} #ifndef LOCAL_ERRORSTRING #define LOCAL_ERRORSTRING errorstring_type loc_errorstring #endif #ifndef WHICH_ERRORSTRING #define WHICH_ERRORSTRING loc_errorstring #endif #define FERR0(X) LOCAL_ERRORSTRING; \ STRNCPY(WHICH_ERRORSTRING, X, MAXERRORSTRING); DEBUGINFOERR #if ! defined FERR #define FERR FERR0 #endif #define FERR1(X,Y) LOCAL_ERRORSTRING; \ SPRINTF(WHICH_ERRORSTRING, X, Y); DEBUGINFOERR #define FERR2(X,Y,Z) LOCAL_ERRORSTRING; \ SPRINTF(WHICH_ERRORSTRING, X, Y, Z); DEBUGINFOERR #define FERR3(X,Y,Z,A) LOCAL_ERRORSTRING; \ SPRINTF(WHICH_ERRORSTRING, X, Y, Z, A); DEBUGINFOERR #define FERR4(X,Y,Z,A,B) LOCAL_ERRORSTRING; \ SPRINTF(WHICH_ERRORSTRING,X,Y,Z,A,B); DEBUGINFOERR #define FERR5(X,Y,Z,A,B,C) LOCAL_ERRORSTRING; \ SPRINTF(WHICH_ERRORSTRING,X,Y,Z,A,B,C); DEBUGINFOERR #define FERR6(X,Y,Z,A,B,C,D) LOCAL_ERRORSTRING; \ SPRINTF(WHICH_ERRORSTRING,X,Y,Z,A,B,C,D); DEBUGINFOERR #define FERR7(X,Y,Z,A,B,C,D,E) LOCAL_ERRORSTRING; \ SPRINTF(WHICH_ERRORSTRING,X,Y,Z,A,B,C,D,E); DEBUGINFOERR #ifndef LOCAL_ERROR #define LOCAL_ERROR(N) {} #endif #define NERR00(N) LOCAL_ERROR(N); return N; #define NERR0(N,X) { FERR0(X); NERR00(N)} #if ! defined NERR #define NERR NERR0 #endif #define NERR1(N,X,Y) { FERR1(X, Y); NERR00(N)} #define NERR2(N,X, Y, Z) { FERR2(X, Y, Z); NERR00(N)} #define NERR3(N,X, Y, Z, A) { FERR3(X, Y, Z, A); NERR00(N)} #define NERR4(N,X, Y, Z, A, B) { FERR4(X, Y, Z, A, B); NERR00(N)} #define NERR5(N,X, Y, Z, A, B, C) { FERR5(X, Y, Z, A, B, C); NERR00(N)} #define NERR6(N,X, Y, Z, A, B, C, D) { FERR6(X, Y, Z, A,B,C,D); NERR00(N)} #define NERR7(N,X,Y,Z, A, B, C, D, E) { FERR7(X,Y,Z,A,B,C,D,E); NERR00(N)} #define SERR0(X) NERR0(ERRORM, X) #if ! defined SERR #define SERR SERR0 #endif #define SERR1(X,Y) NERR1(ERRORM, X, Y) #define SERR2(X,Y,Z) NERR2(ERRORM, X, Y, Z) #define SERR3(X,Y,Z, A) NERR3(ERRORM, X, Y, Z, A) #define SERR4(X,Y,Z, A, B) NERR4(ERRORM, X, Y, Z, A, B) #define SERR5(X,Y,Z, A, B, C) NERR5(ERRORM, X, Y, Z, A, B, C) #define SERR6(X,Y,Z, A, B, C, D) NERR6(ERRORM, X, Y, Z, A, B, C, D) #define SERR7(X,Y,Z, A, B, C, D, E) NERR7(ERRORM, X, Y, Z, A, B, C, D, E) #define CERR00 err=ERRORM; continue; #define CERR0(X) { FERR0(X); CERR00} #if ! 
defined CERR #define CERR CERR0 #endif #define CERR1(X,Y) { FERR1(X, Y); CERR00} #define CERR2(X, Y, Z) { FERR2(X, Y, Z); CERR00} #define CERR3(X, Y, Z, A) { FERR3(X, Y, Z, A); CERR00} #define GERR00 LOCAL_ERROR(ERRORM); err = ERRORM; goto ErrorHandling; #define GERR0(X) {FERR0(X); GERR00} #if ! defined GERR #define GERR GERR0 #endif #define GERR1(X,Y) {FERR1(X,Y); GERR00} #define GERR2(X,Y,Z) {FERR2(X,Y,Z); GERR00} #define GERR3(X,Y,Z,A) {FERR3(X,Y,Z,A); GERR00} #define GERR4(X,Y,Z,A,B) {FERR4(X,Y,Z,A,B); GERR00} #define GERR5(X,Y,Z,A,B,C) {FERR5(X,Y,Z,A,B,C); GERR00} #define GERR6(X,Y,Z,A,B,C,D) {FERR6(X,Y,Z,A,B,C,D); GERR00} #define GNERR00(N) err = N; goto ErrorHandling; #define GNERR0(N,X) {FERR0(X); GNERR00(N)} #if ! defined GNERR #define GNERR GNERR0 #endif #define GNERR1(N,X,Y) {FERR1(X,Y);GNERR00(N)} #define GNERR2(N,X,Y,Z) {FERR2(X,Y,Z); GNERR00(N)} #define GNERR3(N,X,Y,Z,A) {FERR3(X,Y,Z,A); GNERR00(N)} #define GNERR4(N,X,Y,Z,A,B) {FERR4(X,Y,Z,A,B); GNERR00(N)} #define GNERR5(N,X,Y,Z,A,B,C) {FERR5(X,Y,Z,A,B,C); GNERR00(N)} #define GNERR6(N,X,Y,Z,A,B,C,D) {FERR6(X,Y,Z,A,B,C,D); GNERR00(N)} #define RFWARNING warning #define WARN0 RFWARNING #define WARN1(X, Y) {errorstring_type W_MSG; \ SPRINTF(W_MSG, X, Y); RFWARNING(W_MSG);} #define WARN2(X, Y, Z) {errorstring_type W_MSG; \ SPRINTF(W_MSG, X, Y, Z); RFWARNING(W_MSG);} #define WARN3(X, Y, Z, A) {errorstring_type W_MSG;\ SPRINTF(W_MSG, X, Y, Z, A); RFWARNING(W_MSG);} #define WARN4(X, Y, Z, A, B) {errorstring_type W_MSG; \ SPRINTF(W_MSG, X, Y, Z, A, B); RFWARNING(W_MSG);} #define WARN5(X, Y, Z, A, B, C) {errorstring_type W_MSG; \ SPRINTF(W_MSG, X, Y, Z, A, B, C); RFWARNING(W_MSG);} #define WARN6(X, Y, Z, A, B,C,D) {errorstring_type W_MSG; \ SPRINTF(W_MSG, X, Y, Z, A, B, C, D); RFWARNING(W_MSG);} #define WARN7(X, Y, Z,A,B,C,D,E) {errorstring_type W_MSG; \ SPRINTF(W_MSG, X, Y, Z, A, B, C, D, E); RFWARNING(W_MSG);} #endif RandomFieldsUtils/inst/include/kleinkram.h0000644000176200001440000002521314227157055020403 0ustar liggesusers/* Authors Martin Schlather, schlather@math.uni-mannheim.de Copyright (C) 2015 -- 2021 Martin Schlather This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ // by 3.2.2021: xAx:: BLAS lohnt noch nicht // A^t A: BLAS lohnt sich ab aA = k x n, k >=8, n > MAXOWN #ifndef kleinkram_rfutils_h #define kleinkram_rfutils_h 1 #if defined OBSOLETE_RFU && !defined RFU_NEED_OBSOLETE // #warning OBSOLETE_RFU void strcopyN(char *dest, const char *src, int n); usr_bool UsrBool(SEXP p, char *name, int idx); usr_bool UsrBoolRelaxed(SEXP p, char *name, int idx); #define INT Integer(el, name, 0) #define LOGI Logical(el, name, 0) #define NUM Real(el, name, 0) #define USRLOG UsrBool(el, name, 0) #define USRLOGRELAXED UsrBoolRelaxed(el, name, 0) #define CHR Char(el, name) #define STR(X, N) strcopyN(X, CHAR(STRING_ELT(el, 0)), N); #define POS0INT NonNegInteger(el, name) /* better: non-negative */ #define POS0NUM NonNegReal(el, name) #define NEG0NUM NonPosReal(el, name) #define POSINT PositiveInteger(el, name) /* better: non-negative */ #define POSNUM PositiveReal(el, name) SEXP Logic(bool* V, int n, int max) ; SEXP Num(double* V, int n, int max) ; SEXP Int(int *V, int n, int max) ; SEXP Char(const char **V, int n, int max) ; SEXP Mat(double* V, int row, int col, int max); SEXP Mat_t(double* V, int row, int col, int max); SEXP MatInt(int* V, int row, int col, int max) ; SEXP MatString(char **V, int row, int col, int max); //SEXP Array3D(int** V, int depth, int row, int col, int max) ; SEXP String(char *V); SEXP Logic(bool* V, int n) ; SEXP Num(double* V, int n) ; SEXP Int(int *V, int n) ; SEXP Char(const char **V, int n) ; SEXP Mat(double* V, int row, int col); SEXP Mat_t(double* V, int row, int col); SEXP MatInt(int* V, int row, int col) ; SEXP MatString(char** V, int row, int col); //SEXP Array3D(int** V, int depth, int row, int col) ; SEXP String(char V[][MAXCHAR], int n, int max); SEXP String(int *V, const char * List[], int n, int endvalue); SEXP TooLarge(int *n, int l); SEXP TooSmall(); double Real(SEXP p, char *name, int idx); void Real(SEXP el, char *name, double *vec, int maxn) ; int Integer(SEXP p, char *name, int idx, bool nulltoNA) ; int Integer(SEXP p, char *name, int idx); void Integer(SEXP el, char *name, int *vec, int maxn) ; void Integer2(SEXP el, char *name, int *vec) ; bool Logical(SEXP p, char *name, int idx); char Char(SEXP el, char *name) ; double NonNegInteger(SEXP el, char *name) ; double NonNegReal(SEXP el, char *name) ; double NonPosReal(SEXP el, char *name) ; double PositiveInteger(SEXP el, char *name) ; double PositiveReal(SEXP el, char *name) ; void String(SEXP el, char *name, char names[][MAXCHAR], int maxlen); #define MULTIPLEMATCHING -2 #define NOMATCHING -1 #define MATCHESINTERNAL -3 int Match(char *name, const char * List[], int n); int Match(char *name, name_type List, int n); SEXP ExtendedInteger(double x); SEXP ExtendedBooleanUsr(usr_bool x); double XkCXtl(double *X, double *C, int nrow, int dim, int k, int l); void XCXt(double *X, double *C, double *V, int nrow, int dim); void AtA(double *a, int nrow, int ncol, double *A); void xA(double *x, double*A, int nrow, int ncol, double *y); void xA_noomp(double *x, double*A, int nrow, int ncol, double *y); void xA(double *x1, double *x2, double*A, int nrow, int ncol, double *y1, double *y2); void xAx(double *x, double*A, int nrow, double *y); void Ax(double *A, double*x, int nrow, int ncol, double *y); void Ax(double *A, double*x1, double*x2, int nrow, int ncol, double *y1, double *y2); double xUy(double *x, double *U, double *y, int dim); double xUxz(double *x, double *U, int dim, double *z); double x_UxPz(double *x, double *U, double *z, int dim); double xUx(double *x, double *U, int 
dim); void matmult(double *A, double *B, double *C, int l, int m, int n); void matmulttransposed(double *A, double *B, double *C, int m, int l, int n); void matmult_2ndtransp(double *A, double *B, double *C, int m, int l, int n); void matmult_tt(double *A, double *B, double *C, int m, int l, int n); double *matrixmult(double *m1, double *m2, int dim1, int dim2, int dim3); void GetName(SEXP el, char *name, const char * List[], int n, int defaultvalue, int endvalue, int *ans, int maxlen_ans); int GetName(SEXP el, char *name, const char * List[], int n) ; int GetName(SEXP el, char *name, const char * List[], int n, int defaultvalue) ; #define SCALAR_PROD(A, B, N, ANS) { \ int k_ =0, \ end_ = N - 4; \ ANS = 0.0; \ for (; k_ // uintptr_t #include "def.h" #include "parallel_simd.h" #if defined MINGWCPUID #include #elif defined WINCPUID //#warning loading intrin.h the first time #include #endif //#if defined _ _ARM_NEON //#include //#if defined(_ _LP64_ _) && _ _LP64_ _ //#endif //#endif #if ! defined MEMisALIGNED #define MEMisALIGNED Nan #endif #if defined ARM32 && defined SSE2 #include "sse2neon.h" #elif defined AVX || defined SSE2 //|| defined AVX2 || defined #include #endif #if __GNUC__ > 4 || \ (__GNUC__ == 4 && (__GNUC_MINOR__ > 9 || \ (__GNUC_MINOR__ == 9 && __GNUC_PATCHLEVEL__ >= 1))) //#define OpenMP4 1 #endif union uni32{ uint32_t vi; float f4[1]; uint32_t u32[1]; uint16_t u16[2]; uint8_t u8[4]; }; union uni64{ uint64_t vi; uint64_t u64[1]; double d8[1]; float f4[2]; uint32_t u32[2]; uint16_t u16[4]; uint8_t u8[8]; }; union uni128{ #if defined SSE2 __m128i vi; __m128d d; __m128 f; __m128d d128[1]; #endif uint64_t u64[2]; uint32_t u32[4]; uint8_t u8[16]; // __m64 m64[2]; double halfd[2], d8[2]; float halff[4], f4[4]; }; union uni256 { #if defined AVX2 __m256i vi; __m256d d; __m256 f; #endif #if defined SSE2 || defined AVX2 __m128i i128[2]; __m128d d128[2]; __m128d halfd[2]; __m128 halff[2]; #endif uint64_t u64[4]; uint32_t u32[8]; uint8_t u8[32]; // __m64 m64[4]; double d8[4]; float f4[8]; }; union uni512 { #if defined AVX512 __m512i vi; __m512d d; __m512 f; #endif #if defined AVX2 || defined AVX512 __m256i i256[2]; __m256d d256[2]; __m256d halfd[2]; __m256 halff[2]; #endif #if defined SSE2 || defined AVX2 || defined AVX512 __m128i i128[4]; __m128d d128[4]; __m128 f128[4]; #endif uint64_t u64[8]; uint32_t u32[16]; uint8_t u8[64]; // __m64 m64[4]; double d8[8]; float f4[16]; }; #define BitsPerByte 8U #if defined AVX512 #define SIMD_AVAILABILITY avx512f #define SSEBITS 512U #define SSEMODE 30U #define BlockType0 __m512i #define BlockType __m512i ALIGNED #define UnionType0 uni512 #define Double __m512d #define LOADuDOUBLE _mm512_loadu_pd #define LOADU _mm512_loadu_si512 // _mm512_lddqu_si512 #if defined MEM_IS_ALIGNED #define LOADDOUBLE _mm512_load_pd #define LOAD _mm512_load_si512 #else #define LOAD LOADU #define LOADDOUBLE LOADuDOUBLE #endif #define MAXDOUBLE _mm512_max_pd #define ADDDOUBLE _mm512_add_pd #define SUBDOUBLE _mm512_sub_pd #define MULTDOUBLE _mm512_mul_pd #define STOREuDOUBLE _mm512_storeu_pd #define ZERODOUBLE _mm512_setzero_pd #define MULTFLOAT _mm512_mul_ps #define ADDFLOAT _mm512_add_ps #define SUBFLOAT _mm512_sub_ps #define ZEROFLOAT _mm512_setzero_ps //#define BLENDFLOAT _mm256_blend_ps //#define DUPLICATEFLOAT _mm512_moveldup_ps #define MASK0ADDDOUBLE(A,M,B) _mm512_maskz_add_pd(A, M, A, B) // #define BLENDDOUBLE _mm256_blend_pd #define DUPLICATEDOUBLE _mm512_movedup_pd #define MAXINTEGER _mm512_max_epi32 #define AND _mm512_and_si512 #define OR _mm512_or_si512 #define 
XOR _mm512_xor_si512 #define ANY(A) (! _mm512_kortestz(_mm512_test_epi32_mask(A, A), _mm512_test_epi32_mask(A, A))) #define SHR32 _mm512_srli_epi32 // see also _mm512512_rol_epi64, #define SHL32 _mm512_slli_epi32 #define SHR16 _mm512_srli_epi16 #define SHR64 _mm512_srli_epi64 #define SHL64 _mm512_slli_epi64 #define SET16 _mm512_set1_epi16 #define SET32 _mm512_set1_epi32 #define SET64 _mm512_set1_epi64 // oder _m512d _mm512_set1_pd (double a) #define ZERO _mm512_setzero_si512 #define STORE_DOUBLE _mm512_store_pd //#define EXTRACT16 _mm512_extract_epi16 #define ADD32 _mm512_add_epi32 #define MADD16 _mm512_madd_epi16 #define ADD64 _mm512_add_epi64 #define MULT32 _mm512_mullo_epi32 #define SET8 _mm512_set1_epi8 // nicht! BW #define SETREV8( B15,B14,B13,B12,B11,B10,B9,B8,B7,B6,B5,B4,B3,B2,B1,B0) \ _mm512_set_epi8(B15,B14,B13,B12,B11,B10,B9,B8,B7,B6,B5,B4,B3,B2,B1,B0, \ B15,B14,B13,B12,B11,B10,B9,B8,B7,B6,B5,B4,B3,B2,B1,B0, \ B15,B14,B13,B12,B11,B10,B9,B8,B7,B6,B5,B4,B3,B2,B1,B0, \ B15,B14,B13,B12,B11,B10,B9,B8,B7,B6,B5,B4,B3,B2,B1,B0) #if defined AVX512BW #define ADD8 _mm512_add_epi8 #define SAD8 _mm512_sad_epu8 #define SHUFFLE8 _mm512_shuffle_epi8 #elif #define LOWER256(A) (__m256i) _mm512_extractf64x4_pd((__m512d) (A), 0) #define UPPER256(A) (__m256i) _mm512_extractf64x4_pd((__m512d) (A), 1) #define DO_256(X, A, B) \ _mm512_inserti64x4(_mm512_zextsi256_si512(X(LOWER256(A), LOWER256(B))), \ X(UPPER256(A), UPPER256(B)), 1) #define ADD8(A, B) DO_256(_mm256_add_epi8, A, B) #define SAD8(A, B) DO_256(_mm256_sad_epu8, A, B) #define SHUFFLE8 DO_256(_mm256_shuffle_epi8, A, B) #endif #elif defined AVX #define SSEBITS 256U #define SSEMODE 20U #define BlockType0 __m256i #define BlockType __m256i ALIGNED #define UnionType0 uni256 #define Double __m256d #define LOADuDOUBLE _mm256_loadu_pd #if defined MEM_IS_ALIGNED #define LOADDOUBLE _mm256_load_pd #else #define LOADDOUBLE LOADuDOUBLE #endif #define MAXDOUBLE _mm256_max_pd #define ADDDOUBLE _mm256_add_pd #define SUBDOUBLE _mm256_sub_pd #define MULTDOUBLE _mm256_mul_pd #define STOREuDOUBLE _mm256_storeu_pd #define ZERODOUBLE _mm256_setzero_pd #define MULTFLOAT _mm256_mul_ps #define ADDFLOAT _mm256_add_ps #define SUBFLOAT _mm256_sub_ps #define ZEROFLOAT _mm256_setzero_ps #define BLENDFLOAT _mm256_blend_ps #define DUPLICATEFLOAT _mm256_moveldup_ps #define MASK0ADDDOUBLE(A,M,B) _mm256_maskz_add_pd(A, M, A, B) #define BLENDDOUBLE _mm256_blend_pd #define DUPLICATEDOUBLE _mm256_movedup_pd #if defined AVX2 #define LOADU _mm256_loadu_si256 // _mm256_lddqu_si256 #if defined MEM_IS_ALIGNED #define LOAD _mm256_load_si256 // _mm256_lddqu_si256 #else #define LOAD LOADU #endif #define MAXINTEGER _mm256_max_epi32 #define AND _mm256_and_si256 #define OR _mm256_or_si256 #define XOR _mm256_xor_si256 #define ANY(A) (!_mm256_testz_si256(A, A)) #define SHR32 _mm256_srli_epi32 // see also _mm256512_rol_epi64, #define SHL32 _mm256_slli_epi32 #define SHR16 _mm256_srli_epi16 #define SHR64 _mm256_srli_epi64 #define SHL64 _mm256_slli_epi64 #define SHUFFLE8 _mm256_shuffle_epi8 #define SET8 _mm256_set1_epi8 #define SETREV8( B15,B14,B13,B12,B11,B10,B9,B8,B7,B6,B5,B4,B3,B2,B1,B0) \ _mm256_setr_epi8(B15,B14,B13,B12,B11,B10,B9,B8,B7,B6,B5,B4,B3,B2,B1,B0, \ B15,B14,B13,B12,B11,B10,B9,B8,B7,B6,B5,B4,B3,B2,B1,B0) #define SET16 _mm256_set1_epi16 #define SET32 _mm256_set1_epi32 #define SET64 _mm256_set1_epi64x // oder _m256d _mm256_set1_pd (double a) #define ZERO _mm256_setzero_si256 #define STORE_DOUBLE _mm256_store_pd #define EXTRACT16 _mm256_extract_epi16 #define ADD8 _mm256_add_epi8 #define 
ADD32 _mm256_add_epi32 #define MADD16 _mm256_madd_epi16 #define ADD64 _mm256_add_epi64 #define SAD8 _mm256_sad_epu8 #define MULT32 _mm256_mullo_epi32 #define SIMD_AVAILABILITY avx2 #else #define SIMD_AVAILABILITY avx #define MAXINTEGER _mm_max_epi32 #endif #elif defined SSE2 #define SSEBITS 128U #define SSEMODE 10U #define BlockType0 __m128i #define BlockType __m128i ALIGNED #define UnionType0 uni128 #define Double __m128d #define LOADU _mm_loadu_si128 #define LOADuDOUBLE _mm_loadu_pd #if defined MEM_IS_ALIGNED #define LOADDOUBLE _mm_load_pd #define LOAD _mm_load_si128 #else #define LOAD LOADU #define LOADDOUBLE LOADuDOUBLE #endif #define MAXDOUBLE _mm_max_pd #define MAXINTEGER _mm_max_epi32 #define ADDDOUBLE _mm_add_pd #define SUBDOUBLE _mm_sub_pd #define MULTDOUBLE _mm_mul_pd #define STOREuDOUBLE _mm_storeu_pd #define ZERODOUBLE _mm_setzero_pd #define MULTFLOAT _mm_mul_ps #define ADDFLOAT _mm_add_ps #define SUBFLOAT _mm_sub_ps #define ZEROFLOAT _mm_setzero_ps #define BLENDFLOAT _mm_blend_ps #define DUPLICATEFLOAT _mm_moveldup_ps #define AND _mm_and_si128 #define OR _mm_or_si128 #define XOR _mm_xor_si128 bool any128(__m128i A); #define ANY(A) any128(A) #define SHR32 _mm_srli_epi32 // see also _mm512_rol_epi64, #define SHL32 _mm_slli_epi32 #define SHR16 _mm_srli_epi16 #define SHR64 _mm_srli_epi64 #define SHL64 _mm_slli_epi64 #define SET8 _mm_set1_epi8 #define SETREV8 _mm_setr_epi8 #define SET16 _mm_set1_epi16 #define SET32 _mm_set1_epi32 #define SET64 _mm_set1_epi64x #define ZERO _mm_setzero_si128 #define STORE_DOUBLE _mm_store_pd #define EXTRACT16 _mm_extract_epi16 #define ADD8 _mm_add_epi8 #define ADD32 _mm_add_epi32 #define ADD64 _mm_add_epi64 #define MADD16 _mm_madd_epi16 #define SAD8 _mm_sad_epu8 // _pu8? #define INT2FLOAT _mm_cvtepi32_ps #define INT2DOUBLE _mm_cvtpi32_pd // very expensive #define BLENDDOUBLE _mm_blend_pd #define DUPLICATEDOUBLE _mm_movedup_pd //#define MOVEMASK _mm_movemask_ps //#define BLEND _mm_blend_pd //see also _mm512_mask_inserti64x4_mm_insert_epi64 #if defined SSSE3 // within SSE2 #define SIMD_AVAILABILITY sse2 #define SHUFFLE8 _mm_shuffle_epi8 #else #define SIMD_AVAILABILITY ssse3 #endif #elif defined MMX || defined PlainInteger64 // particularly Bit23 #define SIMD_AVAILABILITY no_sse #define SSEBITS 64U #define SSEMODE 0U #define BlockType0 uint64_t #define BlockType BlockType0 #define UnionType0 uni64 #define AND(B,C) (B) & (C) #define OR(B,C) (B) | (C) #define XOR(B,C) (B) xor (C) #define SHR64(B,C) (B) >> (C) #define SHR32 SHR64 // unsafe #define SHR16 SHR64 // unsafe #define SHL64(B,C) (B) << (C) #define SHL32 SHL64 // unsafe #define SHL16 SHL64 // unsafe #define SET32 (Ulong) 0x0000000100000001L * (Ulong) #define ADD64(B,C) (B) + (C) #define ADD32 ADD64 // unsafe #define ZERO() 0L #define LOADU(A) *(A) #define LOAD LOADU #define SET8(A) (((BlockType0) (A)) * ((BlockType0) 0x0101010101010101L)) #if defined MMX #define ADD8(B,C) (BlockType0) _mm_add_pi8((__m64) B, (__m64) C) #else #define ADD8(B,C) (((BlockType0) (B)) + ((BlockType0) (C))) // unsafe #endif #define ANY #else #define SIMD_AVAILABILITY no_sse #define SSEBITS 32U #define SSEMODE 0U #define BlockType0 uint32_t #define BlockType BlockType0 #define UnionType0 uni32 #if defined PlainInteger32 #define AND(B,C) (B) & (C) #define OR(B,C) (B) | (C) #define XOR(B,C) (B) xor (C) #define SHR32(B,C) (B) >> (C) #define SHR16 SHR32 // unsafe #define SHL32(B,C) (B) << (C) #define SHL16 SHL32 // unsafe #define SHL64 SHL32 // unsafe #define SET32 #define ADD64(B,C) (B) + (C) #define ZERO() 0U #define ANY 
#define LOADU(A) *(A) #define LOAD LOADU #define SET8(A) (((BlockType0) (A)) * ((BlockType0) 0x01010101U)) #define ADD8(B,C) (((BlockType0) (B)) + ((BlockType0) (C))) // unsafe ! #else #if defined __GNUC__ && defined SCHLATHERS_MACHINE #warning No specification of any SIMD. #endif #endif #endif // AVX512 .. PlaintInteger32 #if defined AVX #define SCALAR_DEFAULT SCALAR_NEARFMA #else #define SCALAR_DEFAULT SCALAR_BASE #endif #define BytesPerBlock (SSEBITS / BitsPerByte) #define ALIGNED __attribute__ ((aligned (BytesPerBlock))) #define doubles (BytesPerBlock / 8) #define integers (BytesPerBlock / 4) /////////////////////////////////////////////////////////////////////// // checks whether current hardware matches the compilation // * mainly intel (and amd) cores // * but also GPU /////////////////////////////////////////////////////////////////////// #define noMISS 0U #define noUSE 0U #define anyrelevantUSE 0U #define gpuUSE 1U #define avx2USE 2U #define avxUSE 3U #define ssse3USE 4U #define sse2USE 5U #define avx512fUSE 6U #define USEnMISS 10U #define gpuMISS 11U #define avx2MISS 12U #define avxMISS 13U #define ssse3MISS 14U #define sse2MISS 15U #define avx512fMISS 16U #define anyMISS (1 << gpuMISS) | (1 << avx2MISS) | (1 << avxMISS) | \ (1 << ssse3MISS) | (1 << sse2MISS) | (1 << avx512fMISS) #define SIMD_INFO \ allmiss | alluse | (HAS_PARALLEL || alluse != 0) * (1 << anyrelevantUSE) | \ ((HAS_PARALLEL || alluse != noUSE) && !(HAS_PARALLEL && allmiss==noMISS)) * \ (1 << USEnMISS) #if defined EAX #if EAX != 0 #define EXX_REDEFINED 1 #endif #else #define EAX 0 #endif #if defined EBX #if EBX != 1U #define EXX_REDEFINED 1 #endif #else #define EBX 1 #endif #if defined ECX #if ECX != 2 #define EXX_REDEFINED 1 #endif #else #define ECX 2 #endif #if defined EDX #if EDX != 3 #define EXX_REDEFINED 1 #endif #else #define EDX 3 #endif //#define sse3 Available(1, ECX,0) #define no_sseAvail true #define no_sseMISS 999U #define no_sseUSE 999U #define ssse3Avail Available(1, ECX,9) #define sse41Avail Available(1, ECX,19) #define avxAvail Available(1, ECX,28) #define sseAvail Available(1, EDX,25) #define sse2Avail Available(1, EDX,26) #define avx2Avail Available(7, EBX,5) #define avx512fAvail Available(7, EBX,16) #define avx512dqAvail Available(7, EBX, 17) #define avx512pfAvail Available(7, EBX,26) #define avx512erAvail Available(7, EBX,27) #define avx512cdAvail Available(7, EBX,28) #define avx512bwAvail Available(7, EBX,30) #define avx512vlAvail Available(7, EBX,31) #define avx512vbmiAvail Available(7, ECX, 1) #define avx512vmbi2Avail Available(7, ECX, 6) #define avx512vnniAvail Available(7, ECX, 11) #define avx512bitalgAvail Available(7, ECX, 12) #define avx512popcntAvail Available(7, ECX, 14) #define avx512intersectAvail Available(7, EDX, 8) #define avx512fp16Avail Available(7, EDX, 23) #define avx512bf16Avail Available(7, EAX, 5) // intel Advanced Matrix Calculations #define amxbf16Avail Available(7, EDX, 22) #define amxtileAvail Available(7, EDX, 24) #define amxint8Avail Available(7, EDX, 25) /* PRINTF("blatt %d: %u %u %u %u\n", Blatt, s[0], s[1], s[2], s[3]); \ uint32_t a = s[Register];\ for (int i=31; i>=0; i--){if (i == Bit) PRINTF(" :");PRINTF("%s", (a >> i) & 1 ? 
"1" : "0");if (i%4 == 0) PRINTF(" ");} PRINTF(" register=%d bit=%d %d: %d %d\n", Register, Bit, bit_SSE, s[Register] & (1 << (Bit)), (s[Register] >> Bit) & 1); \ */ #define AVAILABLE_SIMD_OK static inline bool \ Available(unsigned VARIABLE_IS_NOT_USED B, int VARIABLE_IS_NOT_USED R, \ int VARIABLE_IS_NOT_USED Bit) { return true; } #if defined EXX_REDEFINED // unknown system -- don't perform checks #define INSTALL_DEFAULT Inone #define AVAILABLE_SIMD AVAILABLE_SIMD_OK #elif defined ARM32 #define INSTALL_DEFAULT Iask #if defined CROSS_CAPACITY #error "ARM allows only CROSS=noflags and CROSS=FALSE" #elif defined REQUIRED_SIMD && REQUIRED_SIMD <= 2 #error "ARM allows CROSS=noflags and CROSS=FALSE, only." #endif #define AVAILABLE_SIMD AVAILABLE_SIMD_OK #elif defined __APPLE__ // i.e. apple but isn't arm #define INSTALL_DEFAULT Inone #if defined CROSS_CAPACITY #error "old MAC-OS allows only CROSS=noflags and CROSS=FALSE" #elif defined REQUIRED_SIMD && REQUIRED_SIMD != 3 #error "old MAC-OS allows CROSS=noflags and CROSS=FALSE, only." #endif #if defined REQUIRED_SIMD #undef REQUIRED_SIMD #endif #define AVAILABLE_SIMD AVAILABLE_SIMD_OK #elif defined WINCPUID #define INSTALL_DEFAULT Iask #define AVAILABLE_SIMD static inline bool \ Available(unsigned Blatt, int Register, int Bit) { \ uint32_t s[4]; \ __cpuid((int *)s, (int) Blatt); \ return s[Register] & (1 << (Bit)); \ } } #if ! defined MSDOS_WINDOWS #error Puzzled about the underlying system. Please contact maintainer. #endif #elif defined LINUXCPUID #define INSTALL_DEFAULT Iask #define AVAILABLE_SIMD static inline bool \ Available(unsigned Blatt, int Register, int Bit) { \ uint32_t s[4]; \ asm volatile \ ("cpuid": "=a"(s[0]), "=b"(s[1]),"=c"(s[2]), \ "=d"(s[3]):"a"(Blatt),"c"(0)); \ return s[Register] & (1 << (Bit)); \ } #elif defined MINGWCPUID #define INSTALL_DEFAULT Iask // vgl https://github.com/luzexi/MinGW/blob/master/x64/lib/gcc/x86_64-w64-mingw32/4.8.0/include/cpuid.h #if defined SCHLATHERS_MACHINE #define REACT_ON_DIFFERENT_CPUID_RESULTS \ uint32_t u[4]; \ asm volatile \ ("cpuid": "=a"(u[0]), "=b"(u[1]),"=c"(u[2]), \ "=d"(u[3]):"a"(Blatt),"c"(0)); \ PRINTF("%u %u %u %u\n%u %u %u %u\n%u %u %u %u\n", \ u[0],u[1],u[2],u[3], \ t[0],t[1],t[2],t[3], \ s[0],s[1],s[2],s[3]); \ if ((s[0] != t[0] || s[1] != t[1] || s[2] != t[2] || s[3] !=t[3])) BUG #else #define REACT_ON_DIFFERENT_CPUID_RESULTS return false #endif #define AVAILABLE_SIMD static inline bool \ Available(unsigned Blatt, int Register, int Bit) { \ unsigned int t[4]; \ if (!__get_cpuid(Blatt, t, t+1, t+2, t+3)) \ ERR1("unallowed cpuid access. %.80s", CONTACT); \ unsigned int s[4]; \ __cpuid(Blatt, s[0], s[1], s[2], s[3]); \ if ((s[0] != t[0] || s[1] != t[1] || s[2] != t[2] || s[3] != t[3])) { \ /* __get_cpuid does not seem to work for certain registers */ \ /* indeed results may differ (14 Jan 2022) ! */ \ REACT_ON_DIFFERENT_CPUID_RESULTS; } \ return s[Register] & (1 << (Bit)); \ } #else #define INSTALL_DEFAULT Inone #define AVAILABLE_SIMD static inline bool \ Available(unsigned VARIABLE_IS_NOT_USED B, int VARIABLE_IS_NOT_USED R, \ int VARIABLE_IS_NOT_USED Bit) { \ RFERROR("SIMD checks are not available on your system (on MS systems only under Visual Studio). Use 'CROSS' on Linux systems and alike."); \ return false; \ } #if defined REQUIRED_SIMD #undef REQUIRED_SIMD #endif #endif #if defined CROSS_CAPACITY #if defined REQUIRED_SIMD #define ASSERT_TEXT \ "But the CPU doesn't know about it. 
As 'CROSS=TRUE' has been chosen as compilation option, it was assumed that the program was compiled on the least capable CPU." // ok #else #define ASSERT_TEXT \ "But the CPU doesn't know about it. As 'CROSS' has been chosen as compilation option, it was assumed that each CPU has at least the CROSS skills." #endif #elif defined REQUIRED_SIMD // ! CROSS_CAPACITY #if REQUIRED_SIMD == 0 // CROSS = nosimd without -mno-sse2 #define ASSERT_TEXT \ "This means 'without SIMD', but the compiler includes SIMD. ('CROSS=nosimd' has been chosen.)" #elif REQUIRED_SIMD == 1 // CROSS = nosimd and -mno-sse2 #define ASSERT_TEXT\ "This means 'without SIMD', but the CPU requires SIMD at a higher level. Please contact the maintainer." #elif REQUIRED_SIMD == 2 // CROSS=NA #define ASSERT_TEXT \ "This means 'without SIMD', but the compiler includes SIMD (at a higher level). ('CROSS=NA' has been chosen.)" #elif REQUIRED_SIMD == 3 // CROSS=FALSE #if defined AVAILABLE_SIMD #undef AVAILABLE_SIMD #endif #define AVAILABLE_SIMD AVAILABLE_SIMD_OK #define ASSERT_TEXT\ "This situation is unexpected for a PC. Please contact the maintainer." #elif REQUIRED_SIMD == 4 #define ASSERT_TEXT\ "This situation is unexpected on ARM. Please contact the maintainer." #else #define ASSERT_TEXT\ "This leads to an unexpected situation. Please contact the maintainer." #endif #else // ! CROSS_CAPACITY && ! REQUIRED_SIMD #define ASSERT_TEXT\ "This situation is unexpected for a server. Please contact the maintainer." #if defined AVAILABLE_SIMD #undef AVAILABLE_SIMD #endif #define AVAILABLE_SIMD AVAILABLE_SIMD_OK #endif #define ASSERT_AVAILABILITY(V,W) if ((V##Avail)) {} else {char msg[300]; SPRINTF(msg, "The program was compiled for '%.10s%.5s%.10s'. %.200s", #V, STRCMP(#V, #W) ?
#W : "", ASSERT_TEXT); RFERROR(msg);} #define ASSERT_AVAILABILITY_AUX(V,W) ASSERT_AVAILABILITY(V,W) // expands V #define ASSERT_SIMD(FILE, WHAT) \ AVAILABLE_SIMD \ Uint check_simd_##FILE() { \ ASSERT_AVAILABILITY_AUX(SIMD_AVAILABILITY,WHAT); return noMISS;}\ Uint simd_use_##FILE = WHAT##USE; \ Uint simd_miss_##FILE = WHAT##MISS #define ASSERT_SIMD_AUX(FILE, WHAT) ASSERT_SIMD(FILE, WHAT)// expands WHAT #define THIS_FILE_ANYSIMD ASSERT_SIMD_AUX(this_file, SIMD_AVAILABILITY) #define SIMD_MISS(FILE, WHAT) \ Uint check_simd_##FILE() { return 1< #endif #include "def.h" #include #include #include #include #include "AutoRandomFieldsUtils.h" #define RFERROR error #define RFERROR1(M,A) {errorstring_type E_AUX; \ SPRINTF(E_AUX, M, A); RFERROR(E_AUX);} #define RFERROR2(M,A,B) {errorstring_type E_AUX; \ SPRINTF(E_AUX, M, A,B); RFERROR(E_AUX);} #define RFERROR3(M,A,B,C) {errorstring_type E_AUX;\ SPRINTF(E_AUX, M, A,B,C); RFERROR(E_AUX);} #define RFERROR4(M,A,B,C,D) {errorstring_type E_AUX; \ SPRINTF(E_AUX, M, A,B,C,D); RFERROR(E_AUX);} #define RFERROR5(M,A,B,C,D,E) {errorstring_type E_AUX; \ SPRINTF(E_AUX, M, A,B,C,D,E); RFERROR(E_AUX);} #define RFERROR6(M,A,B,C,D,E,F) {errorstring_type E_AUX;\ SPRINTF(E_AUX, M, A,B,C,D,E,F); RFERROR(E_AUX);} #define RFERROR7(M,A,B,C,D,E,F,G) {errorstring_type E_AUX;\ SPRINTF(E_AUX, M, A,B,C,D,E,F,G); RFERROR(E_AUX);} #define MULTIMINSIZE(S) ((S) > 20)// in omp parallel in DO_PARALLEL // #define MULTIMINSIZE(S) false // #define MULTIMINSIZE(S) true typedef char name_type[][MAXCHAR]; typedef enum usr_bool { // NOTE: if more options are included, change ExtendedBoolean in // userinterface.cc of RandomFields False=false, True=true, //Exception=2, // for internal use only Nan=INT_MIN } usr_bool; #define RF_NA NA_REAL #define RF_NAN R_NaN #define RF_NEGINF R_NegInf #define RF_INF R_PosInf #define T_PI M_2_PI #define OBSOLETENAME "obsolete" #define MAXINT 2147483647 #define MININT -2147483647 #define MAXUNSIGNED (MAXINT * 2) + 1 #define INFDIM MAXINT #define INFTY INFDIM #define PIDMODULUS 1000 #define LENGTH length // to avoid the unvoluntiered use of LENGTH defined by R #define complex Rcomplex #define DOT "." #define GAUSS_RANDOM(SIGMA) rnorm(0.0, SIGMA) #define UNIFORM_RANDOM unif_rand() #define POISSON_RANDOM(x) rpois(x) #define SQRT2 M_SQRT2 #define SQRTPI M_SQRT_PI #define INVPI M_1_PI #define PIHALF M_PI_2 #define ONETHIRD 0.333333333333333333333333 #define TWOTHIRD 0.6666666666666666666666667 #define TWOPI 6.283185307179586476925286766559 #define INVLOG2 1.442695040888963 #define INVSQRTTWO 0.70710678118654752440084436210 #define INVSQRTTWOPI 0.39894228040143270286 #define SQRTTWOPI 2.5066282746310002416 #define SQRTINVLOG005 0.5777613700268771079749 //#define LOG05 -0.69314718055994528623 #define LOG3 1.0986122886681096913952452369225257046474905578227 #define LOG2 M_LN2 #define EULER_C 0.5772156649015328606065120900824024310421 #define EPSILON 0.00000000001 #define EPSILON1000 0.000000001 #define MIN(A,B) ((A) < (B) ? (A) : (B)) #define MAX(A,B) ((A) > (B) ? (A) : (B)) #define ACOS std::acos #define ASIN std::asin #define ATAN std::atan #define FMIN fmin2 #define FMAX fmax2 #define ATANH std::atanh #define ACOSH std::acosh #define ASINH std::asinh #define EXPM1 std::expm1 #define LOG1P std::log1p #define FROUND fround #define COS std::cos #define EXP std::exp #define FABS(X) std::fabs((double) X) // OK; keine Klammern um X! #if ! 
defined MALLOCX #define MALLOCX std::malloc #define FLOOR std::floor #define SQRT(X) std::sqrt((double) X) // OK #define CEIL(X) std::ceil((double) X) // OK; keine Klammern um X! #define FREEX std::free #endif #define LOG std::log #define POW(X, Y) R_pow((double) X, (double) Y) // OK; keine Klammern um X! #define SIGN(X) sign((double) X) // OK #define SIN std::sin #define STRCMP(A, B) std::strcmp(A, B) // OK #define STRCPY(A, B) std::strcpy(A, B) // OK #define STRLEN std::strlen #define STRNCMP(A, B, C) std::strncmp(A, B, C) // OK #define STRNCPY(A, B, N) strcopyN(A, B, N) // OK #define TAN std::tan #define MEMCOPYX std::memcpy #define MEMMOVE std::memmove #define MEMSET std::memset #define MEMCMP std::memcmp #define AALLOC std::aligned_alloc #define CALLOCX std::calloc #define SPRINTF std::sprintf // Rprint #define ROUND(X) ownround((double) X) // OK #define TRUNC(X) ftrunc((double) X) // OK; keine Klammern um X! #define QSORT std::qsort #define print NEVER_USE_print_or_PRINTF_WITHIN_PARALLEL /* // */ #if defined SCHLATHERS_MACHINE && defined DO_PARALLEL && defined OMP_H #define PRINTF if (omp_get_num_threads() > 1) { error("\n\nnever use Rprintf/PRINTF within parallel constructions!!\n\n"); } else Rprintf // OK #else #define PRINTF Rprintf #endif #define R_PRINTLEVEL 1 #define C_PRINTLEVEL 1 #define MAXERRORSTRING 1000 typedef char errorstring_type[MAXERRORSTRING]; typedef unsigned int Uint; typedef uint64_t Ulong; typedef int64_t Long; // not SCHLATHERS_MACHINE #ifndef SCHLATHERS_MACHINE #define INTERNALMSG SERR0("Sorry. This functionality doesn't exist currently. There is work in progress at the moment by the maintainer.") #if ! defined assert #define assert(X) {} #endif #define BUG { \ RFERROR4("Severe error occured in function '%.50s' (file '%.50s', line %d).%.200s", \ __FUNCTION__, __FILE__, __LINE__, CONTACT); \ } //#define MEMCOPY(A,B,C) {MEMCPY(A,B,C); printf("memcpy %.50s %d\n", __FILE__, __LINE__);} #define MEMCOPY(A,B,C) MEMCOPYX(A,B,C) #define AMALLOC(ELEMENTS, SIZE) AALLOC(SIZE, (SIZE) * (ELEMENTS)) #if ! defined MALLOC #define MALLOC MALLOCX #define FREE(X) if ((X) == NULL) {} else {FREEX(X); (X)=NULL;} #endif #define CALLOC CALLOCX #define XCALLOC CALLOCX // #define UNCONDFREE(X) {FREEX(X); (X)=NULL;} #endif // not SCHLATHERS_MACHINE // SCHLATHERS_MACHINE #ifdef SCHLATHERS_MACHINE #define MAXALLOC 1000000000L // __extension__ unterdrueckt Fehlermeldung wegen geklammerter Argumente #define INTERNALMSG { \ RFERROR4("made to be an internal function '%.50s' ('%.50s', line %d).", \ __FUNCTION__, __FILE__, __LINE__); \ } #if ! 
defined assert #define assert(X) if (__extension__ (X)) {} else \ RFERROR4("'assert' failed in function '%.50s' (%.50s, line %d) %.200s.", \ __FUNCTION__, __FILE__, __LINE__, CONTACT) #endif #define SHOW_ADDRESSES 1 #define BUG { RFERROR3("BUG in '%.50s' of '%.50s' at line %d.\n", __FUNCTION__, __FILE__, __LINE__);} #define MEMCOPY(A,B,C) __extension__ ({ assert((A)!=NULL && (B)!=NULL && (C)>0 && (C)<=MAXALLOC); MEMCOPYX(A,B,C); }) //#define MEMCOPY(A,B,C) memory_copy(A, B, C) #define CALLOC(X, Y) __extension__({assert((X)>0 && (Y)>0 && ((X) * (Y))0 && (Y)>0 && ((X) * (Y))0 && (X)<=MAXALLOC); MALLOCX(X);}) #define FREE(X) if ((X) == NULL) {} else {if (!SHOWFREE) {} else PRINTF("free %.50s %ld Line %d %s\n", #X, (Long) X, __LINE__, __FILE__); FREEX(X); (X)=NULL;} #endif #define UNCONDFREE(X) { if (!SHOWFREE) {} else PRINTF("(free in %s, line %d)\n", __FILE__, __LINE__); FREEX(X); (X)=NULL;} #endif // SCHLATHERS_MACHINE #if defined SCHLATHER_DEBUGGING #undef MALLOC #undef CALLOC #undef XCALLOC #define MALLOC(X) __extension__({if (!DOPRINT) {} else PRINTF("(MLLC %s, line %d)\n", __FILE__, __LINE__);assert((X)>0 && (X)<=3e9); MALLOCX(X);}) #define CALLOC(X, Y) __extension__({if (!DOPRINT) {} else PRINTF("(CLLC %s, line %d)\n",__FILE__, __LINE__);assert((X)>0 && (Y)>0 && ((X) * (Y)) 0 && (Y)>0 && ((X) * (Y)) = 7 #define FALLTHROUGH_OK __attribute__ ((fallthrough)) #else #define FALLTHROUGH_OK #endif #define UTILSINFO(M) if (!KEYT()->global_utils.basic.helpinfo) {} else PRINTF("%s\n(Note that you can unable this information by 'RFoptions(helpinfo=FALSE)'.)\n", M) // OK #ifdef DO_PARALLEL #define HAS_PARALLEL true #else #define HAS_PARALLEL false #endif #ifdef USEGPU #define HAS_GPU true #else #define HAS_GPU false #endif #ifndef GPU_NEEDS // not a proper installation #define GPU_NEEDS Inone #endif #ifdef OBSOLETE_RFU #if defined SHOW_ADDRESSES #undef SHOW_ADDRESSES #endif #if ! defined RFU_NEED_OBSOLETE #undef FALLTHROUGH_OK #undef HAS_PARALLEL #endif extern int CORES; // from RF V4 on in extern.h: #define LENMSG MAXERRORSTRING #define LENERRMSG MAXERRORSTRING #define nErrorLoc MAXERRORSTRING typedef char errorloc_type[MAXERRORSTRING]; #define utilsparam utilsoption_type #define solve_param solve_options #if defined RFdef_H #define isGLOBAL NA_INTEGER #else #define isGLOBAL false #endif #ifdef _OPENMP #ifdef SCHLATHERS_MACHINE #define DO_PARALLEL 1 #else #define DO_PARALLEL 1 #endif #else #if defined DO_PARALLEL #undef DO_PARALLEL #endif #endif // #define LOCAL_MSG char MSG[LENERRMSG] #ifdef DO_PARALLEL #define LOCAL_ERRMSG2 char MSG2[LENERRMSG] #else // not DO_PARALLEL #define LOCAL_ERRMSG2 #endif //#if defined ERR //#undef ERR //#endif #ifndef ERR #define ERR ERR0 #endif #else // NOT OBSOLETE #if ! defined USE_FC_LEN_T #define USE_FC_LEN_T #endif #define ATAN2 std::atan2 #define COSH std::cosh #define SINH std::sinh #define TANH std::tanh #endif #if ! 
defined NA_LONG #define NA_LONG (-1L - (Long) 9223372036854775807) #endif #define FREE0(PT, WHICH) { \ FREE(PT->WHICH); PT->n_##WHICH= 0;} \ if (PT->WHICH != NULL) { \ UNCONDFREE(PT->WHICH); \ PT->n_##WHICH = 0; \ } else assert(PT->n_##WHICH==0); #endif RandomFieldsUtils/inst/include/xport_import.h0000644000176200001440000000116614227157055021175 0ustar liggesusers #ifndef RandomFieldsUtilsxport_H #define RandomFieldsUtilsxport_H 1 typedef struct KEY_type KEY_type; struct KEY_type { KEY_type *next; utilsoption_type global_utils; int pid, visitingpid; bool ok, doshow; errorstring_type error_location; int *ToIntDummy; int ToIntN, ToRealN ; double *ToRealDummy; double loggamma1old, nu1old, loggamma2old, nu2old, loggamma_old,nuOld, gamma, nuAlt; }; extern KEY_type *PIDKEY[PIDMODULUS]; KEY_type *KEYT(); typedef struct option_type option_type; utilsoption_type *WhichOptionList(bool local); extern const char *R_TYPE_NAMES[LAST_R_TYPE_NAME + 1]; #endif RandomFieldsUtils/inst/include/zzz_calls.h0000644000176200001440000002274114227157055020444 0ustar liggesusers/* Authors Martin Schlather, schlather@math.uni-mannheim.de Copyright (C) 2015 -- 2021 Martin Schlather This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #ifndef rfutils_calls_H #define rfutils_calls_H 1 /* in xport_import.cc of calling packages set #ifdefine ERROR_RFU_CALLS 1 #include "xport_import.h" ... // #define CALL(what) Ext_##what = (what##_type) R_GetCCallable(importfrom, #what) #define CALL(what) Ext_##what = #what_err; see RandomFields, for instance #in clude */ #ifdef ERROR_RFU_CALLS #define RFU_ERRCALL0(TYPE, FCTN) \ static TYPE FCTN##_err(){char msg[300]; SPRINTF(msg, "calling %.50s", #N); RFERROR(msg); } #define RFU_ERRCALL(TYPE, FCTN, ...) \ static TYPE FCTN##_err(__VA_ARGS__) { char msg[300]; SPRINTF(msg, "calling %.50s", #N); RFERROR(msg);} #else #define RFU_ERRCALL0(TYPE, FCTN) #define RFU_ERRCALL(TYPE, FCTN, ...) 
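/* Added illustration (a minimal sketch, not part of this header): a calling package binds to an exported routine by declaring a pointer of the FCTN##_type generated by the DECLAREx macros below and filling it at load time via R's registration API, in the spirit of the CALL(what) macro shown in the comment above:
 *
 *     #include <R_ext/Rdynload.h>
 *     // 'solvePosDef' serves only as a hypothetical example name here:
 *     static solvePosDef_type Ext_solvePosDef = NULL;
 *     void bind_rfutils_calls(void) {
 *       Ext_solvePosDef = (solvePosDef_type)
 *         R_GetCCallable("RandomFieldsUtils", "solvePosDef");
 *     }
 *
 * R_GetCCallable(package, symbol) is the standard R mechanism for calling C routines across packages; the exporting package registers the symbols with R_RegisterCCallable. */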
#endif #define DECLARE0(TYPE, FCTN) \ typedef TYPE (*FCTN##_type)(); \ attribute_hidden TYPE RU_##FCTN(); \ TYPE FCTN(); \ RFU_ERRCALL0(TYPE, FCTN) #define DECLARE1(TYPE, FCTN, A) \ typedef TYPE (*FCTN##_type)(A); \ attribute_hidden TYPE RU_##FCTN(A); \ TYPE FCTN(A); \ RFU_ERRCALL(TYPE, FCTN, A) #define DECLARE2(TYPE, FCTN, A, B) \ typedef TYPE (*FCTN##_type)(A, B); \ attribute_hidden TYPE RU_##FCTN(A, B); \ TYPE FCTN(A, B); \ RFU_ERRCALL(TYPE, FCTN, A, B) #define DECLARE3(TYPE, FCTN, A, B, C) \ typedef TYPE (*FCTN##_type)(A, B, C); \ attribute_hidden TYPE RU_##FCTN(A, B, C); \ TYPE FCTN(A, B, C);\ RFU_ERRCALL(TYPE, FCTN, A, B, C) #define DECLARE4(TYPE, FCTN, A, B, C, D) \ typedef TYPE (*FCTN##_type)(A, B, C, D); \ attribute_hidden TYPE RU_##FCTN(A, B, C, D); \ TYPE FCTN(A, B, C, D); \ RFU_ERRCALL(TYPE, FCTN, A, B, C, D) #define DECLARE5(TYPE, FCTN, A, B, C, D, E) \ typedef TYPE (*FCTN##_type)(A, B, C, D, E); \ attribute_hidden TYPE RU_##FCTN(A, B, C, D, E); \ TYPE FCTN(A, B, C, D, E); \ RFU_ERRCALL(TYPE, FCTN, A, B, C, D, E) #define DECLARE6(TYPE, FCTN, A, B, C, D, E, F) \ typedef TYPE (*FCTN##_type)(A, B, C, D, E, F); \ attribute_hidden TYPE RU_##FCTN(A, B, C, D, E, F); \ TYPE FCTN(A, B, C, D, E, F); \ RFU_ERRCALL(TYPE, FCTN, A, B, C, D, E, F) #define DECLARE7(TYPE, FCTN, A, B, C, D, E, F, G) \ typedef TYPE (*FCTN##_type)(A, B, C, D, E, F, G); \ attribute_hidden TYPE RU_##FCTN(A, B, C, D, E, F, G); \ TYPE FCTN(A, B, C, D, E, F, G); \ RFU_ERRCALL(TYPE, FCTN, A, B, C, D, E, F, G) #define DECLARE8(TYPE, FCTN, A, B, C, D, E, F, G, H) \ typedef TYPE (*FCTN##_type)(A, B, C, D, E, F, G, H); \ attribute_hidden TYPE RU_##FCTN(A, B, C, D, E, F, G, H); \ TYPE FCTN(A, B, C, D, E, F, G, H); \ RFU_ERRCALL(TYPE, FCTN, A, B, C, D, E, F, G, H) #define DECLARE9(TYPE, FCTN, A, B, C, D, E, F, G, H, I) \ typedef TYPE (*FCTN##_type)(A, B, C, D, E, F, G, H, I); \ attribute_hidden TYPE RU_##FCTN(A, B, C, D, E, F, G, H, I); \ TYPE FCTN(A, B, C, D, E, F, G, H, I); \ RFU_ERRCALL(TYPE, FCTN, A, B, C, D, E, F, G, H, I) #define DECLARE10(TYPE, FCTN, A, B, C, D, E, F, G, H, I, J) \ typedef TYPE (*FCTN##_type)(A, B, C, D, E, F, G, H, I, J); \ attribute_hidden TYPE RU_##FCTN(A, B, C, D, E, F, G, H, I, J); \ TYPE FCTN(A, B, C, D, E, F, G, H, I, J); \ RFU_ERRCALL(TYPE, FCTN, A, B, C, D, E, F, G, H, I, J) #define DECLARE11(TYPE, FCTN, A, B, C, D, E, F, G, H, I, J, K) \ typedef TYPE (*FCTN##_type)(A, B, C, D, E, F, G, H, I, J, K); \ attribute_hidden TYPE RU_##FCTN(A, B, C, D, E, F, G, H, I, J, K); \ TYPE FCTN(A, B, C, D, E, F, G, H, I, J, K); \ RFU_ERRCALL(TYPE, FCTN, A, B, C, D, E, F, G, H, I, J, K) #define DECLARE12(TYPE, FCTN, A, B, C, D, E, F, G, H, I, J, K, L) \ typedef TYPE (*FCTN##_type)(A, B, C, D, E, F, G, H, I, J, K, L); \ attribute_hidden TYPE RU_##FCTN(A, B, C, D, E, F, G, H, I, J, K, L); \ TYPE FCTN(A, B, C, D, E, F, G, H, I, J, K, L); \ RFU_ERRCALL(TYPE, FCTN, A, B, C, D, E, F, G, H, I, J, K, L) #define DECLARE13(TYPE, FCTN, A, B, C, D, E, F, G, H, I, J, K, L, M) \ typedef TYPE (*FCTN##_type)(A, B, C, D, E, F, G, H, I, J, K, L, M); \ attribute_hidden TYPE RU_##FCTN(A, B, C, D, E, F, G, H, I, J, K, L, M); \ TYPE FCTN(A, B, C, D, E, F, G, H, I, J, K, L, M); \ RFU_ERRCALL(TYPE, FCTN, A, B, C, D, E, F, G, H, I, J, K, L, M) #define DECLARE14(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N) \ typedef TYPE (*FCTN##_type)(A,B,C,D,E,F,G,H,I,J,K,L,M,N); \ attribute_hidden TYPE RU_##FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N); \ TYPE FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N); \ RFU_ERRCALL(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N) #define 
DECLARE15(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N, O) \ typedef TYPE (*FCTN##_type)(A,B,C,D,E,F,G,H,I,J,K,L,M,N, O); \ attribute_hidden TYPE RU_##FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N, O); \ TYPE FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N, O); \ RFU_ERRCALL(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N, O) #define DECLARE16(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P) \ typedef TYPE (*FCTN##_type)(A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P); \ attribute_hidden TYPE RU_##FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P); \ TYPE FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P); \ RFU_ERRCALL(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P) #define DECLARE17(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P, Q) \ typedef TYPE (*FCTN##_type)(A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P, Q); \ attribute_hidden TYPE RU_##FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P, Q); \ TYPE FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P, Q); \ RFU_ERRCALL(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P, Q) #define DECLARE18(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P, Q, R) \ typedef TYPE (*FCTN##_type)(A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P, Q, R); \ attribute_hidden TYPE RU_##FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P, Q, R); \ TYPE FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P, Q, R); \ RFU_ERRCALL(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P, Q, R) #define DECLARE19(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P, Q, R, S) \ typedef TYPE (*FCTN##_type)(A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P, Q, R, S); \ attribute_hidden TYPE RU_##FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P, Q, R, S); \ TYPE FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P, Q, R, S); \ RFU_ERRCALL(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N, O, P, Q, R, S) #define DECLARE20(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T) \ typedef TYPE (*FCTN##_type)(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T); \ attribute_hidden TYPE RU_##FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T); \ TYPE FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T); \ RFU_ERRCALL(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T) #define DECLARE21(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U) \ typedef TYPE (*FCTN##_type)(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U); \ attribute_hidden TYPE RU_##FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U); \ TYPE FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U); \ RFU_ERRCALL(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U) #define DECLARE22(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V) \ typedef TYPE (*FCTN##_type)(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V); \ attribute_hidden TYPE RU_##FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V); \ TYPE FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V); \ RFU_ERRCALL(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V) #define DECLARE23(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W) \ typedef TYPE (*FCTN##_type)(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V, W); \ attribute_hidden TYPE RU_##FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V, W); \ TYPE FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V, W); \ RFU_ERRCALL(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V, W) #define DECLARE24(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V, W, X) \ typedef TYPE (*FCTN##_type)(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V, W, X); \ attribute_hidden TYPE RU_##FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V, W, X); \ TYPE FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V, W, X); \ RFU_ERRCALL(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V, W, X) #define DECLARE25(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V, W, X, Y) \ typedef TYPE 
(*FCTN##_type)(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V, W, X, Y); \ attribute_hidden TYPE RU_##FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V, W, X, Y); \ TYPE FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V, W, X, Y); \ RFU_ERRCALL(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V, W, X, Y) #define DECLARE26(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V, W, X, Y, Z) \ typedef TYPE (*FCTN##_type)(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V, W, X, Y, Z); \ attribute_hidden TYPE RU_##FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V, W, X, Y, Z); \ TYPE FCTN(A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V, W, X, Y, Z); \ RFU_ERRCALL(TYPE, FCTN, A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V, W, X, Y, Z) #endif RandomFieldsUtils/inst/include/AutoRandomFieldsUtils.h0000644000176200001440000000101714227157055022643 0ustar liggesusers #ifndef auto_rfutils_h #define auto_rfutils_h 1 #include "AutoRandomFieldsUtilsLocal.h" #define MAXUNITS 4 #define MAXCHAR 18 // max number of characters for (covariance) names #define RFOPTIONS "RFoptions" #define CLASS_TRYERROR "try-error" #define WARN_UNKNOWN_OPTION_ALL 4 #define WARN_UNKNOWN_OPTION_SINGLE 3 #define WARN_UNKNOWN_OPTION_CAPITAL 2 #define WARN_UNKNOWN_OPTION_NONE1 1 #define WARN_UNKNOWN_OPTION_NONE 0 #define CONTACT " Please contact the maintainer martin.schlather@math.uni-mannheim.de.\n" #endif RandomFieldsUtils/inst/include/parallel_base.h0000644000176200001440000001231514227157055021213 0ustar liggesusers/* Authors Martin Schlather, schlather@math.uni-mannheim.de Copyright (C) 2021 -- 2021 Martin Schlather This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #ifndef parallel_omp_base_H #define parallel_omp_base_H 1 // NEVER_** is used by parallel_**, so no conflict with NO_, which is used // in the programmes. Except that there is no differences between // NEVER_ and NO_ // #define NEVER_OMP 1 // #define NEVER_AVX 1 // #define NEVER_SSE 1 #if defined WIN32 || defined _WIN32 || defined __WIN32__ #define MSDOS_WINDOWS 1 #elif defined MSDOS_WINDOWS #undef MSDOS_WINDOWS #endif #if defined __x86_64 || defined __x86_64__ || defined __amd64__ || defined __amd64 || defined _M_X64 #define X86_64 1 #elif defined X86_64 #undef X86_64 #endif #if defined __arm64__ || defined __arm64 || defined __aarch64__ #define ARM64 1 #elif defined ARM64 #undef ARM64 #endif #if defined __arm32__ || defined __arm__ || defined ARM64 #define ARM32 1 #define NO_AVX 1 #endif #if defined __ARM_NEON__ || defined __aarch64__ || defined _M_ARM || defined _M_ARM64 #define NEON 1 #elif defined NEON #undef NEON #endif #if defined MSDOS_WINDOWS || defined (__APPLE__) || defined(__sun) #if defined TIME_AVAILABLE #undef TIME_AVAILABLE #endif #else #define TIME_AVAILABLE 1 #endif #if defined _OPENMP && ! defined NO_OMP && ! defined NEVER_OMP && ! defined ARM32 && ! 
defined __APPLE__ // 15 Jan 2022 #if defined SCHLATHERS_MACHINE #define DO_PARALLEL 1 // may change value when debugging #else #define DO_PARALLEL 1// never changes value #endif #elif defined DO_PARALLEL #undef DO_PARALLEL #endif #if defined NEVER_SSE #ifndef NO_SSE2 #define NO_SSE2 1 #endif #elif defined NEVER_AVX #ifndef NO_AVX #define NO_AVX 1 #endif #elif defined NEVER_AVX512 #ifndef NO_AVX512 #define NO_AVX512 1 #endif #endif #if defined NO_SSE2 && ! defined NO_SSE3 #define NO_SSE3 1 #endif #if defined NO_SSE3 && ! defined NO_SSSE3 #define NO_SSSE3 1 #endif #if defined NO_SSSE3 && ! defined NO_SSE41 #define NO_SSE41 1 #endif #if defined NO_SSE41 && ! defined NO_AVX #define NO_AVX 1 #endif #if defined NO_AVX && ! defined NO_AVX2 #define NO_AVX2 1 #endif #if defined NO_AVX2 && ! defined NO_AVX512 #define NO_AVX512 1 #endif #if ! defined NO_AVX512 #if ! defined DO_AVX512BITALG && ! defined DO_AVX512BW && ! defined DO_AVX512CD && ! defined DO_AVX512DQ && ! defined DO_AVX512ER && ! defined DO_AVX512F && ! defined DO_AVX512IFMA && ! defined DO_AVX512PF && ! defined DO_AVX512VBMI && ! defined DO_AVX512VL && ! defined DO_AVX512VPOPCNTDQ && ! defined DO_AVX5124FMAPS && ! defined DO_AVX5124VNNIW #define DO_AVX512BITALG 1 #define DO_AVX512BW 1 #define DO_AVX512CD 1 #define DO_AVX512DQ 1 #define DO_AVX512ER 1 #define DO_AVX512F 1 #define DO_AVX512IFMA 1 #define DO_AVX512PF 1 #define DO_AVX512VBMI 1 #define DO_AVX512VL 1 #define DO_AVX512VPOPCNTDQ 1 #define DO_AVX5124FMAPS 1 #define DO_AVX5124VNNIW 1 #endif #if defined __AVX512BITALG__ && defined DO_AVX512BITALG #define AVX512BITALG 1 #endif #if defined __AVX512BW__ && defined DO_AVX512BW #define AVX512BW 1 #endif #if defined __AVX512CD__ && defined DO_AVX512CD #define AVX512CD 1 #endif #if defined __AVX512DQ__ && defined DO_AVX512DQ #define AVX512DQ 1 #endif #if defined __AVX512ER__ && defined DO_AVX512ER #define AVX512ER 1 #endif #if defined __AVX512F__ && defined DO_AVX512F #define AVX512F 1 #define AVX512 1 #endif #if defined __AVX512IFMA__ && defined DO_AVX512IFMA #define AVX512IFMA 1 #endif #if defined __AVX512PF__ && defined DO_AVX512PF #define AVX512PF 1 #endif #if defined __AVX512VBMI__ && defined DO_AVX512VBMI #define AVX512VBMI 1 #endif #if defined __AVX512VL__ && defined DO_AVX512VL #define AVX512VL 1 // #endif #if defined __AVX512VPOPCNTDQ__ && defined DO_AVX512VPOPCNTDQ #define AVX512VPOPCNTDQ 1 // #endif #if defined __AVX5124FMAPS__ && defined DO_AVX5124FMAPS #define AVX5124FMAPS 1 #endif #if defined __AVX5124VNNIW__ && defined DO_AVX5124VNNIW #define AVX5124VNNIW 1 #endif #endif // end ! no 512 #if defined __AVX2__ && ! defined NO_AVX2 #define AVX2 1 #elif defined AVX2 #undef AVX2 #endif #if defined __AVX__ && ! defined NO_AVX #define AVX 1 #elif defined AVX #undef AVX #endif #if (defined __SSE41__ || defined NEON) && ! defined NO_SSE41 #define SSE41 1 #elif defined SSE41 #undef SSE41 #endif #if (defined __SSSE3__ || defined NEON) && ! defined NO_SSSE3 #define SSSE3 1 #elif defined SSSE3 #undef SSSE3 #endif #if (defined __SSE3__ || defined NEON) && ! defined NO_SSE3 #define SSE3 1 #elif defined SSE3 #undef SSE3 #endif #if (defined __SSE2__ || defined NEON) && ! defined NO_SSE2 #define SSE2 1 #elif defined SSE2 #undef SSE2 #endif #endif RandomFieldsUtils/inst/include/sse2neon.h0000644000176200001440000123307014227157055020165 0ustar liggesusers#ifndef SSE2NEON_H #define SSE2NEON_H #include #include #include #include #if ! 
defined MALLOCX #define SQRT(X) std::sqrt((double) X) // OK #define FLOOR std::floor #define CEIL(X) std::ceil((double) X) // OK; keine Klammern um X! #define MALLOCX std::malloc #define FREEX std::free #define MALLOC MALLOCX #define FREE(X) if ((X) == NULL) {} else {FREEX(X); (X)=NULL;} #endif #define BUG_SSE2NEON error("Severe error occured in sse2neon. Please contact schlather@math.uni-mannheim.de") #define ZERO_SSE2NEON _mm_setzero_si128 // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! // !!!!!!!!!!!!! NOTE THAT THIS FILE HAS BEEN MODIFIED !!!!!!!!!!!!!!! // !!!!!!!!!!!!! AND SHOULD NOT BE DISTRIBUTED OUTSIDE !!!!!!!!!!!!!!! // !!!!!!!!!!!!! THIS R PACKAGE !!!!!!!!!!!!!!! // !!!!!!!!!!!!! 13 Jan 2022, Martin Schlather !!!!!!!!!!!!!!! // !!!!!!!!!!!!! 27 Mar 2022, Martin Schlather !!!!!!!!!!!!!!! // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! // This header file provides a simple API translation layer // between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions // // This header file does not yet translate all of the SSE intrinsics. // // Contributors to this work are: // John W. Ratcliff // Brandon Rowlett // Ken Fast // Eric van Beurden // Alexander Potylitsin // Hasindu Gamaarachchi // Jim Huang // Mark Cheng // Malcolm James MacLeod // Devin Hussey (easyaspi314) // Sebastian Pop // Developer Ecosystem Engineering // Danila Kutenin // François Turban (JishinMaster) // Pei-Hsuan Hung // Yang-Hao Yuan // Syoyo Fujita // Brecht Van Lommel /* * sse2neon is freely redistributable under the MIT License. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /* Tunable configurations */ /* Enable precise implementation of math operations * This would slow down the computation a bit, but gives consistent result with * x86 SSE. (e.g. 
would solve a hole or NaN pixel in the rendering result) */ /* _mm_min|max_ps|ss|pd|sd */ #ifndef SSE2NEON_PRECISE_MINMAX #define SSE2NEON_PRECISE_MINMAX (0) #endif /* _mm_rcp_ps and _mm_div_ps */ #ifndef SSE2NEON_PRECISE_DIV #define SSE2NEON_PRECISE_DIV (0) #endif /* _mm_sqrt_ps and _mm_rsqrt_ps */ #ifndef SSE2NEON_PRECISE_SQRT #define SSE2NEON_PRECISE_SQRT (0) #endif /* _mm_dp_pd */ #ifndef SSE2NEON_PRECISE_DP #define SSE2NEON_PRECISE_DP (0) #endif /* compiler specific definitions */ #if defined(__GNUC__) || defined(__clang__) #pragma push_macro("FORCE_INLINE") #pragma push_macro("ALIGN_STRUCT") #define FORCE_INLINE static inline __attribute__((always_inline)) #define ALIGN_STRUCT(x) __attribute__((aligned(x))) #define _sse2neon_likely(x) __builtin_expect(!!(x), 1) #define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0) #else /* non-GNU / non-clang compilers */ #warning "Macro name collisions may happen with unsupported compiler." #ifndef FORCE_INLINE #define FORCE_INLINE static inline #endif #ifndef ALIGN_STRUCT #define ALIGN_STRUCT(x) __declspec(align(x)) #endif #define _sse2neon_likely(x) (x) #define _sse2neon_unlikely(x) (x) #endif //#include //#include /* Architecture-specific build options */ /* FIXME: #pragma GCC push_options is only available on GCC */ #if defined(__GNUC__) #if defined(__arm__) && __ARM_ARCH == 7 /* According to ARM C Language Extensions Architecture specification, * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON) * architecture supported. */ #if !defined(__ARM_NEON) || !defined(__ARM_NEON__) #error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON." #endif #if !defined(__clang__) #pragma GCC push_options #pragma GCC target("fpu=neon") #endif #elif defined(__aarch64__) #if !defined(__clang__) #pragma GCC push_options #pragma GCC target("+simd") #endif #else #error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A." #endif #endif #include /* Rounding functions require either Aarch64 instructions or libm failback */ #if !defined(__aarch64__) #include #endif /* "__has_builtin" can be used to query support for built-in functions * provided by gcc/clang and other compilers that support it. */ #ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */ /* Compatibility with gcc <= 9 */ #if defined(__GNUC__) && (__GNUC__ <= 9) #define __has_builtin(x) HAS##x #define HAS__builtin_popcount 1 #define HAS__builtin_popcountll 1 #else #define __has_builtin(x) 0 #endif #endif /** * MACRO for shuffle parameter for _mm_shuffle_ps(). * Argument fp3 is a digit[0123] that represents the fp from argument "b" * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same * for fp2 in result. fp1 is a digit[0123] that represents the fp from * argument "a" of mm_shuffle_ps that will be places in fp1 of result. * fp0 is the same for fp0 of result. */ #define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \ (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0))) /* Rounding mode macros. 
*/ #define _MM_FROUND_TO_NEAREST_INT 0x00 #define _MM_FROUND_TO_NEG_INF 0x01 #define _MM_FROUND_TO_POS_INF 0x02 #define _MM_FROUND_TO_ZERO 0x03 #define _MM_FROUND_CUR_DIRECTION 0x04 #define _MM_FROUND_NO_EXC 0x08 #define _MM_FROUND_RAISE_EXC 0x00 #define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC) #define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC) #define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC) #define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC) #define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC) #define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC) #define _MM_ROUND_NEAREST 0x0000 #define _MM_ROUND_DOWN 0x2000 #define _MM_ROUND_UP 0x4000 #define _MM_ROUND_TOWARD_ZERO 0x6000 /* Flush zero mode macros. */ #define _MM_FLUSH_ZERO_MASK 0x8000 #define _MM_FLUSH_ZERO_ON 0x8000 #define _MM_FLUSH_ZERO_OFF 0x0000 /* Denormals are zeros mode macros. */ #define _MM_DENORMALS_ZERO_MASK 0x0040 #define _MM_DENORMALS_ZERO_ON 0x0040 #define _MM_DENORMALS_ZERO_OFF 0x0000 /* indicate immediate constant argument in a given range */ #define __constrange(a, b) const /* A few intrinsics accept traditional data types like ints or floats, but * most operate on data types that are specific to SSE. * If a vector type ends in d, it contains doubles, and if it does not have * a suffix, it contains floats. An integer vector type can contain any type * of integer, from chars to shorts to unsigned long longs. */ typedef int64x1_t __m64; typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */ // On ARM 32-bit architecture, the float64x2_t is not supported. // The data type __m128d should be represented in a different way for related // intrinsic conversion. 
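// Added note (illustrative only): as the typedefs directly below show, on AArch64 __m128d is a genuine float64x2_t, so e.g.
//     __m128d v = _mm_set_pd(2.0, 1.0);   // low lane 1.0, high lane 2.0
// maps onto native double-precision NEON lanes, whereas on ARMv7 the same __m128d is only a reinterpreted float32x4_t and the double-precision intrinsics in this file have to handle the two 64-bit halves without native float64x2_t support.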
#if defined(__aarch64__) typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */ #else typedef float32x4_t __m128d; #endif typedef int64x2_t __m128i; /* 128-bit vector containing integers */ // __int64 is defined in the Intrinsics Guide which maps to different datatype // in different data model #if !(defined(_WIN32) || defined(_WIN64) || defined(__int64)) #if (defined(__x86_64__) || defined(__i386__)) #define __int64 long long #else #define __int64 int64_t #endif #endif /* type-safe casting between types */ #define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x) #define vreinterpretq_m128_f32(x) (x) #define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x) #define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x) #define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x) #define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x) #define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x) #define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x) #define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x) #define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x) #define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x) #define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x) #define vreinterpretq_f32_m128(x) (x) #define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x) #define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x) #define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x) #define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x) #define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x) #define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x) #define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x) #define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x) #define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x) #define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x) #define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x) #define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x) #define vreinterpretq_m128i_s64(x) (x) #define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x) #define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x) #define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x) #define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x) #define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x) #define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x) #define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x) #define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x) #define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x) #define vreinterpretq_s64_m128i(x) (x) #define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x) #define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x) #define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x) #define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x) #define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x) #define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x) #define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x) #define vreinterpret_m64_s64(x) (x) #define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x) #define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x) #define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x) #define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x) #define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x) #define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x) #define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x) #define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x) #define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x) #define vreinterpret_u32_m64(x) 
vreinterpret_u32_s64(x) #define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x) #define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x) #define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x) #define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x) #define vreinterpret_s64_m64(x) (x) #define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x) #if defined(__aarch64__) #define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x) #define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x) #define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x) #define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x) #define vreinterpretq_m128d_f64(x) (x) #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x) #define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x) #define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x) #define vreinterpretq_f64_m128d(x) (x) #define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x) #else #define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x) #define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x) #define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x) #define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x) #define vreinterpretq_m128d_f32(x) (x) #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x) #define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x) #define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x) #define vreinterpretq_f32_m128d(x) (x) #endif // A struct is defined in this header file called 'SIMDVec' which can be used // by applications which attempt to access the contents of an __m128 struct // directly. It is important to note that accessing the __m128 struct directly // is bad coding practice by Microsoft: @see: // https://docs.microsoft.com/en-us/cpp/cpp/m128 // // However, some legacy source code may try to access the contents of an __m128 // struct directly so the developer can use the SIMDVec as an alias for it. Any // casting must be done manually by the developer, as you cannot cast or // otherwise alias the base NEON data type for intrinsic operations. // // union intended to allow direct access to an __m128 variable using the names // that the MSVC compiler provides. This union should really only be used when // trying to access the members of the vector as integer values. GCC/clang // allow native access to the float members through a simple array access // operator (in C since 4.6, in C++ since 4.8). // // Ideally direct accesses to SIMD vectors should not be used since it can cause // a performance hit. If it really is needed however, the original __m128 // variable can be aliased with a pointer to this union and used to access // individual components. The use of this union should be hidden behind a macro // that is used throughout the codebase to access the members instead of always // declaring this type of variable. typedef union ALIGN_STRUCT(16) SIMDVec { float m128_f32[4]; // as floats - DON'T USE. Added for convenience. int8_t m128_i8[16]; // as signed 8-bit integers. int16_t m128_i16[8]; // as signed 16-bit integers. int32_t m128_i32[4]; // as signed 32-bit integers. int64_t m128_i64[2]; // as signed 64-bit integers. uint8_t m128_u8[16]; // as unsigned 8-bit integers. uint16_t m128_u16[8]; // as unsigned 16-bit integers. uint32_t m128_u32[4]; // as unsigned 32-bit integers. uint64_t m128_u64[2]; // as unsigned 64-bit integers. 
} SIMDVec; // casting using SIMDVec #define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n]) #define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n]) #define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n]) /* SSE macros */ #define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode #define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode #define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode #define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode // Function declaration // SSE FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(); FORCE_INLINE __m128 _mm_move_ss(__m128, __m128); FORCE_INLINE __m128 _mm_or_ps(__m128, __m128); FORCE_INLINE __m128 _mm_set_ps1(float); FORCE_INLINE __m128 _mm_setzero_ps(void); // SSE2 FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i); FORCE_INLINE __m128i _mm_castps_si128(__m128); FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i); FORCE_INLINE __m128i _mm_cvtps_epi32(__m128); FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d); FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i); FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int); FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t); FORCE_INLINE __m128d _mm_set_pd(double, double); FORCE_INLINE __m128i _mm_set1_epi32(int); FORCE_INLINE __m128i _mm_setzero_si128(); // SSE4.1 FORCE_INLINE __m128d _mm_ceil_pd(__m128d); FORCE_INLINE __m128 _mm_ceil_ps(__m128); FORCE_INLINE __m128d _mm_floor_pd(__m128d); FORCE_INLINE __m128 _mm_floor_ps(__m128); FORCE_INLINE __m128d _mm_round_pd(__m128d, int); FORCE_INLINE __m128 _mm_round_ps(__m128, int); // SSE4.2 FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t); /* Backwards compatibility for compilers with lack of specific type support */ // Older gcc does not define vld1q_u8_x4 type #if defined(__GNUC__) && !defined(__clang__) && \ ((__GNUC__ <= 10 && defined(__arm__)) || \ (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \ (__GNUC__ <= 9 && defined(__aarch64__))) FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p) { uint8x16x4_t ret; ret.val[0] = vld1q_u8(p + 0); ret.val[1] = vld1q_u8(p + 16); ret.val[2] = vld1q_u8(p + 32); ret.val[3] = vld1q_u8(p + 48); return ret; } #else // Wraps vld1q_u8_x4 FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p) { return vld1q_u8_x4(p); } #endif /* Function Naming Conventions * The naming convention of SSE intrinsics is straightforward. A generic SSE * intrinsic function is given as follows: * _mm__ * * The parts of this format are given as follows: * 1. describes the operation performed by the intrinsic * 2. identifies the data type of the function's primary arguments * * This last part, , is a little complicated. It identifies the * content of the input values, and can be set to any of the following values: * + ps - vectors contain floats (ps stands for packed single-precision) * + pd - vectors cantain doubles (pd stands for packed double-precision) * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit * signed integers * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit * unsigned integers * + si128 - unspecified 128-bit vector or 256-bit vector * + m128/m128i/m128d - identifies input vector types when they are different * than the type of the returned vector * * For example, _mm_setzero_ps. The _mm implies that the function returns * a 128-bit vector. The _ps at the end implies that the argument vectors * contain floats. 
* * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8) * // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits * __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); * // Set packed 8-bit integers * // 128 bits, 16 chars, per 8 bits * __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11, * 4, 5, 12, 13, 6, 7, 14, 15); * // Shuffle packed 8-bit integers * __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb * * Data (Number, Binary, Byte Index): +------+------+-------------+------+------+-------------+ | 1 | 2 | 3 | 4 | Number +------+------+------+------+------+------+------+------+ | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary +------+------+------+------+------+------+------+------+ | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | Index +------+------+------+------+------+------+------+------+ +------+------+------+------+------+------+------+------+ | 5 | 6 | 7 | 8 | Number +------+------+------+------+------+------+------+------+ | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary +------+------+------+------+------+------+------+------+ | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | Index +------+------+------+------+------+------+------+------+ * Index (Byte Index): +------+------+------+------+------+------+------+------+ | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | +------+------+------+------+------+------+------+------+ +------+------+------+------+------+------+------+------+ | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | +------+------+------+------+------+------+------+------+ * Result: +------+------+------+------+------+------+------+------+ | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | Index +------+------+------+------+------+------+------+------+ | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary +------+------+------+------+------+------+------+------+ | 256 | 2 | 5 | 6 | Number +------+------+------+------+------+------+------+------+ +------+------+------+------+------+------+------+------+ | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | Index +------+------+------+------+------+------+------+------+ | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary +------+------+------+------+------+------+------+------+ | 3 | 7 | 4 | 8 | Number +------+------+------+------+------+------+-------------+ */ /* Constants for use with _mm_prefetch. */ enum _mm_hint { _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */ _MM_HINT_T0 = 1, /* load data to L1 and L2 cache */ _MM_HINT_T1 = 2, /* load data to L2 cache only */ _MM_HINT_T2 = 3, /* load data to L2 cache only, mark it as NTA */ _MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */ _MM_HINT_ET0 = 5, /* exclusive version of _MM_HINT_T0 */ _MM_HINT_ET1 = 6, /* exclusive version of _MM_HINT_T1 */ _MM_HINT_ET2 = 7 /* exclusive version of _MM_HINT_T2 */ }; // The bit field mapping to the FPCR(floating-point control register) typedef struct { uint16_t res0; uint8_t res1 : 6; uint8_t bit22 : 1; uint8_t bit23 : 1; uint8_t bit24 : 1; uint8_t res2 : 7; #if defined(__aarch64__) uint32_t res3; #endif } fpcr_bitfield; // Takes the upper 64 bits of a and places it in the low end of the result // Takes the lower 64 bits of b and places it into the high end of the result. 
FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b) { float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); return vreinterpretq_m128_f32(vcombine_f32(a32, b10)); } // takes the lower two 32-bit values from a and swaps them and places in high // end of result takes the higher two 32 bit values from b and swaps them and // places in low end of result. FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b) { float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b))); return vreinterpretq_m128_f32(vcombine_f32(a01, b23)); } FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b) { float32x2_t a21 = vget_high_f32( vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); float32x2_t b03 = vget_low_f32( vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); return vreinterpretq_m128_f32(vcombine_f32(a21, b03)); } FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b) { float32x2_t a03 = vget_low_f32( vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); float32x2_t b21 = vget_high_f32( vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); return vreinterpretq_m128_f32(vcombine_f32(a03, b21)); } FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b) { float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); return vreinterpretq_m128_f32(vcombine_f32(a10, b10)); } FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b) { float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); return vreinterpretq_m128_f32(vcombine_f32(a01, b10)); } FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b) { float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b))); return vreinterpretq_m128_f32(vcombine_f32(a01, b01)); } // keeps the low 64 bits of b in the low and puts the high 64 bits of a in the // high FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b) { float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); return vreinterpretq_m128_f32(vcombine_f32(a10, b32)); } FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b) { float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1); float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); return vreinterpretq_m128_f32(vcombine_f32(a11, b00)); } FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b) { float32x2_t a22 = vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); return vreinterpretq_m128_f32(vcombine_f32(a22, b00)); } FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b) { float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0); float32x2_t b22 = vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0); return vreinterpretq_m128_f32(vcombine_f32(a00, b22)); } FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b) { float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); float32x2_t a22 = vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/ float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); 
return vreinterpretq_m128_f32(vcombine_f32(a02, b32)); } FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b) { float32x2_t a33 = vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1); float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1); return vreinterpretq_m128_f32(vcombine_f32(a33, b11)); } FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b) { float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2); float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); float32x2_t b20 = vset_lane_f32(b2, b00, 1); return vreinterpretq_m128_f32(vcombine_f32(a10, b20)); } FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b) { float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); float32_t b2 = vgetq_lane_f32(b, 2); float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); float32x2_t b20 = vset_lane_f32(b2, b00, 1); return vreinterpretq_m128_f32(vcombine_f32(a01, b20)); } FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b) { float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); float32_t b2 = vgetq_lane_f32(b, 2); float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); float32x2_t b20 = vset_lane_f32(b2, b00, 1); return vreinterpretq_m128_f32(vcombine_f32(a32, b20)); } // Kahan summation for accurate summation of floating-point numbers. // http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y) { y -= *c; float t = *sum + y; *c = (t - *sum) - y; *sum = t; } #if defined(__ARM_FEATURE_CRYPTO) // Wraps vmull_p64 FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) { poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0); poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0); return vreinterpretq_u64_p128(vmull_p64(a, b)); } #else // ARMv7 polyfill // ARMv7/some A64 lacks vmull_p64, but it has vmull_p8. // // vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a // 64-bit->128-bit polynomial multiply. // // It needs some work and is somewhat slow, but it is still faster than all // known scalar methods. 
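/* Usage sketch (an illustrative addition, not part of the original sse2neon
 * sources): _sse2neon_kadd_f32 above performs one step of Kahan (compensated)
 * summation, where *c carries the low-order bits a plain float accumulator
 * would lose. A hypothetical caller could sum an array as follows:
 *
 *   float sum = 0.0f, comp = 0.0f;
 *   for (size_t i = 0; i < n; i++)
 *       _sse2neon_kadd_f32(&sum, &comp, data[i]);
 *   // sum now holds the compensated total of data[0..n-1]
 */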
// // Algorithm adapted to C from // https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted // from "Fast Software Polynomial Multiplication on ARM Processors Using the // NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab // (https://hal.inria.fr/hal-01506572) static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) { poly8x8_t a = vreinterpret_p8_u64(_a); poly8x8_t b = vreinterpret_p8_u64(_b); // Masks uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff), vcreate_u8(0x00000000ffffffff)); uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff), vcreate_u8(0x0000000000000000)); // Do the multiplies, rotating with vext to get all combinations uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0 uint8x16_t e = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1 uint8x16_t f = vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0 uint8x16_t g = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2 uint8x16_t h = vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0 uint8x16_t i = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3 uint8x16_t j = vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0 uint8x16_t k = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // L = A0 * B4 // Add cross products uint8x16_t l = veorq_u8(e, f); // L = E + F uint8x16_t m = veorq_u8(g, h); // M = G + H uint8x16_t n = veorq_u8(i, j); // N = I + J // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL // instructions. #if defined(__aarch64__) uint8x16_t lm_p0 = vreinterpretq_u8_u64( vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); uint8x16_t lm_p1 = vreinterpretq_u8_u64( vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); uint8x16_t nk_p0 = vreinterpretq_u8_u64( vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); uint8x16_t nk_p1 = vreinterpretq_u8_u64( vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); #else uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m)); uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m)); uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k)); uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k)); #endif // t0 = (L) (P0 + P1) << 8 // t1 = (M) (P2 + P3) << 16 uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1); uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32); uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h); // t2 = (N) (P4 + P5) << 24 // t3 = (K) (P6 + P7) << 32 uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1); uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00); uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h); // De-interleave #if defined(__aarch64__) uint8x16_t t0 = vreinterpretq_u8_u64( vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); uint8x16_t t1 = vreinterpretq_u8_u64( vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); uint8x16_t t2 = vreinterpretq_u8_u64( vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); uint8x16_t t3 = vreinterpretq_u8_u64( vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); #else uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h)); uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h)); uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h)); uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h)); #endif // Shift the cross products uint8x16_t t0_shift 
= vextq_u8(t0, t0, 15); // t0 << 8 uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16 uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24 uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32 // Accumulate the products uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift); uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift); uint8x16_t mix = veorq_u8(d, cross1); uint8x16_t r = veorq_u8(mix, cross2); return vreinterpretq_u64_u8(r); } #endif // ARMv7 polyfill // C equivalent: // __m128i _mm_shuffle_epi32_default(__m128i a, // __constrange(0, 255) int imm) { // __m128i ret; // ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; // ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03]; // return ret; // } #define _mm_shuffle_epi32_default(a, imm) \ __extension__({ \ int32x4_t ret; \ ret = vmovq_n_s32( \ vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3))); \ ret = vsetq_lane_s32( \ vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \ ret, 1); \ ret = vsetq_lane_s32( \ vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \ ret, 2); \ ret = vsetq_lane_s32( \ vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \ ret, 3); \ vreinterpretq_m128i_s32(ret); \ }) // Takes the upper 64 bits of a and places it in the low end of the result // Takes the lower 64 bits of a and places it into the high end of the result. FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a) { int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); return vreinterpretq_m128i_s32(vcombine_s32(a32, a10)); } // takes the lower two 32-bit values from a and swaps them and places in low end // of result takes the higher two 32 bit values from a and swaps them and places // in high end of result. 
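/* Usage sketch (an illustrative addition, not part of the original sse2neon
 * sources): the immediate of _mm_shuffle_epi32_default above encodes one
 * source lane per two bits, least significant bits first, so 0x1B
 * (binary 00 01 10 11) selects lanes 3, 2, 1, 0 and reverses the vector:
 *
 *   __m128i v = _mm_setr_epi32(10, 20, 30, 40);
 *   __m128i r = _mm_shuffle_epi32_default(v, 0x1B);
 *   // r now holds (40, 30, 20, 10)
 */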
FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a) { int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a))); return vreinterpretq_m128i_s32(vcombine_s32(a01, a23)); } // rotates the least significant 32 bits into the most significant 32 bits, and // shifts the rest down FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a) { return vreinterpretq_m128i_s32( vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1)); } // rotates the most significant 32 bits into the least significant 32 bits, and // shifts the rest up FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a) { return vreinterpretq_m128i_s32( vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3)); } // gets the lower 64 bits of a, and places it in the upper 64 bits // gets the lower 64 bits of a and places it in the lower 64 bits FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a) { int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); return vreinterpretq_m128i_s32(vcombine_s32(a10, a10)); } // gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the // lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a) { int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); return vreinterpretq_m128i_s32(vcombine_s32(a01, a10)); } // gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the // upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and // places it in the lower 64 bits FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a) { int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); return vreinterpretq_m128i_s32(vcombine_s32(a01, a01)); } FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a) { int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1); int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); return vreinterpretq_m128i_s32(vcombine_s32(a11, a22)); } FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a) { int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); return vreinterpretq_m128i_s32(vcombine_s32(a22, a01)); } FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a) { int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1); return vreinterpretq_m128i_s32(vcombine_s32(a32, a33)); } // FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255) // int imm) #if defined(__aarch64__) #define _mm_shuffle_epi32_splat(a, imm) \ __extension__({ \ vreinterpretq_m128i_s32( \ vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \ }) #else #define _mm_shuffle_epi32_splat(a, imm) \ __extension__({ \ vreinterpretq_m128i_s32( \ vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \ }) #endif // NEON does not support a general purpose permute intrinsic // Selects four specific single-precision, floating-point values from a and b, // based on the mask i. 
// // C equivalent: // __m128 _mm_shuffle_ps_default(__m128 a, __m128 b, // __constrange(0, 255) int imm) { // __m128 ret; // ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; // ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03]; // return ret; // } // // https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx #define _mm_shuffle_ps_default(a, b, imm) \ __extension__({ \ float32x4_t ret; \ ret = vmovq_n_f32( \ vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))); \ ret = vsetq_lane_f32( \ vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \ ret, 1); \ ret = vsetq_lane_f32( \ vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \ ret, 2); \ ret = vsetq_lane_f32( \ vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \ ret, 3); \ vreinterpretq_m128_f32(ret); \ }) // Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified // by imm. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100) // FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a, // __constrange(0,255) int // imm) #define _mm_shufflelo_epi16_function(a, imm) \ __extension__({ \ int16x8_t ret = vreinterpretq_s16_m128i(a); \ int16x4_t lowBits = vget_low_s16(ret); \ ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \ ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \ 1); \ ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \ 2); \ ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \ 3); \ vreinterpretq_m128i_s16(ret); \ }) // Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified // by imm. // https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx // FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a, // __constrange(0,255) int // imm) #define _mm_shufflehi_epi16_function(a, imm) \ __extension__({ \ int16x8_t ret = vreinterpretq_s16_m128i(a); \ int16x4_t highBits = vget_high_s16(ret); \ ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \ ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \ 5); \ ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \ 6); \ ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \ 7); \ vreinterpretq_m128i_s16(ret); \ }) /* MMX */ //_mm_empty is a no-op on arm FORCE_INLINE void _mm_empty(void) {} /* SSE */ // Adds the four single-precision, floating-point values of a and b. // // r0 := a0 + b0 // r1 := a1 + b1 // r2 := a2 + b2 // r3 := a3 + b3 // // https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b) { return vreinterpretq_m128_f32( vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } // adds the scalar single-precision floating point values of a and b. // https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b) { float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0); // the upper values in the result must be the remnants of . return vreinterpretq_m128_f32(vaddq_f32(a, value)); } // Computes the bitwise AND of the four single-precision, floating-point values // of a and b. 
// // r0 := a0 & b0 // r1 := a1 & b1 // r2 := a2 & b2 // r3 := a3 & b3 // // https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b) { return vreinterpretq_m128_s32( vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); } // Computes the bitwise AND-NOT of the four single-precision, floating-point // values of a and b. // // r0 := ~a0 & b0 // r1 := ~a1 & b1 // r2 := ~a2 & b2 // r3 := ~a3 & b3 // // https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b) { return vreinterpretq_m128_s32( vbicq_s32(vreinterpretq_s32_m128(b), vreinterpretq_s32_m128(a))); // *NOTE* argument swap } // Average packed unsigned 16-bit integers in a and b, and store the results in // dst. // // FOR j := 0 to 3 // i := j*16 // dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16 FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b) { return vreinterpret_m64_u16( vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b))); } // Average packed unsigned 8-bit integers in a and b, and store the results in // dst. // // FOR j := 0 to 7 // i := j*8 // dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8 FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b) { return vreinterpret_m64_u8( vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); } // Compares for equality. // https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32( vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } // Compares for equality. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100) FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpeq_ps(a, b)); } // Compares for greater than or equal. // https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32( vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } // Compares for greater than or equal. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100) FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpge_ps(a, b)); } // Compares for greater than. // // r0 := (a0 > b0) ? 0xffffffff : 0x0 // r1 := (a1 > b1) ? 0xffffffff : 0x0 // r2 := (a2 > b2) ? 0xffffffff : 0x0 // r3 := (a3 > b3) ? 0xffffffff : 0x0 // // https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32( vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } // Compares for greater than. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100) FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpgt_ps(a, b)); } // Compares for less than or equal. // // r0 := (a0 <= b0) ? 0xffffffff : 0x0 // r1 := (a1 <= b1) ? 0xffffffff : 0x0 // r2 := (a2 <= b2) ? 0xffffffff : 0x0 // r3 := (a3 <= b3) ? 
0xffffffff : 0x0 // // https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32( vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } // Compares for less than or equal. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100) FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmple_ps(a, b)); } // Compares for less than // https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32( vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } // Compares for less than // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100) FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmplt_ps(a, b)); } // Compares for inequality. // https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32(vmvnq_u32( vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); } // Compares for inequality. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100) FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpneq_ps(a, b)); } // Compares for not greater than or equal. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100) FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32(vmvnq_u32( vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); } // Compares for not greater than or equal. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100) FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpnge_ps(a, b)); } // Compares for not greater than. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100) FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32(vmvnq_u32( vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); } // Compares for not greater than. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100) FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpngt_ps(a, b)); } // Compares for not less than or equal. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100) FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32(vmvnq_u32( vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); } // Compares for not less than or equal. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100) FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpnle_ps(a, b)); } // Compares for not less than. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100) FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32(vmvnq_u32( vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); } // Compares for not less than. 
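/* Usage sketch (an illustrative addition, not part of the original sse2neon
 * sources): the comparison intrinsics above return per-lane masks of all
 * ones or all zeros, which combine with the bitwise operations into a
 * branchless select. For two __m128 values a and b:
 *
 *   __m128 mask = _mm_cmplt_ps(a, b);   // all ones where a < b, else zero
 *   __m128 mn   = _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b));
 *   // for non-NaN inputs, mn equals _mm_min_ps(a, b) lane by lane
 */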
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100) FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpnlt_ps(a, b)); } // Compares the four 32-bit floats in a and b to check if any values are NaN. // Ordered compare between each value returns true for "orderable" and false for // "not orderable" (NaN). // https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see // also: // http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean // http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b) { // Note: NEON does not have ordered compare builtin // Need to compare a eq a and b eq b to check for NaN // Do AND of results to get final uint32x4_t ceqaa = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); uint32x4_t ceqbb = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb)); } // Compares for ordered. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100) FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpord_ps(a, b)); } // Compares for unordered. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100) FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b) { uint32x4_t f32a = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); uint32x4_t f32b = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b))); } // Compares for unordered. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100) FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpunord_ps(a, b)); } // Compares the lower single-precision floating point scalar values of a and b // using an equality operation. : // https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b) { uint32x4_t a_eq_b = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); return vgetq_lane_u32(a_eq_b, 0) & 0x1; } // Compares the lower single-precision floating point scalar values of a and b // using a greater than or equal operation. : // https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b) { uint32x4_t a_ge_b = vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); return vgetq_lane_u32(a_ge_b, 0) & 0x1; } // Compares the lower single-precision floating point scalar values of a and b // using a greater than operation. : // https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b) { uint32x4_t a_gt_b = vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); return vgetq_lane_u32(a_gt_b, 0) & 0x1; } // Compares the lower single-precision floating point scalar values of a and b // using a less than or equal operation. : // https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b) { uint32x4_t a_le_b = vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); return vgetq_lane_u32(a_le_b, 0) & 0x1; } // Compares the lower single-precision floating point scalar values of a and b // using a less than operation. 
: // https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx Important // note!! The documentation on MSDN is incorrect! If either of the values is a // NAN the docs say you will get a one, but in fact, it will return a zero!! FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b) { uint32x4_t a_lt_b = vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); return vgetq_lane_u32(a_lt_b, 0) & 0x1; } // Compares the lower single-precision floating point scalar values of a and b // using an inequality operation. : // https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b) { return !_mm_comieq_ss(a, b); } // Convert packed signed 32-bit integers in b to packed single-precision // (32-bit) floating-point elements, store the results in the lower 2 elements // of dst, and copy the upper 2 packed elements from a to the upper elements of // dst. // // dst[31:0] := Convert_Int32_To_FP32(b[31:0]) // dst[63:32] := Convert_Int32_To_FP32(b[63:32]) // dst[95:64] := a[95:64] // dst[127:96] := a[127:96] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b) { return vreinterpretq_m128_f32( vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), vget_high_f32(vreinterpretq_f32_m128(a)))); } // Convert packed single-precision (32-bit) floating-point elements in a to // packed 32-bit integers, and store the results in dst. // // FOR j := 0 to 1 // i := 32*j // dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a) { #if defined(__aarch64__) return vreinterpret_m64_s32( vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))))); #else return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32( vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION))))); #endif } // Convert the signed 32-bit integer b to a single-precision (32-bit) // floating-point element, store the result in the lower element of dst, and // copy the upper 3 packed elements from a to the upper elements of dst. // // dst[31:0] := Convert_Int32_To_FP32(b[31:0]) // dst[127:32] := a[127:32] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b) { return vreinterpretq_m128_f32( vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); } // Convert the lower single-precision (32-bit) floating-point element in a to a // 32-bit integer, and store the result in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si FORCE_INLINE int _mm_cvt_ss2si(__m128 a) { #if defined(__aarch64__) return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))), 0); #else float32_t data = vgetq_lane_f32( vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0); return (int32_t) data; #endif } // Convert packed 16-bit integers in a to packed single-precision (32-bit) // floating-point elements, and store the results in dst. 
// // FOR j := 0 to 3 // i := j*16 // m := j*32 // dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a) { return vreinterpretq_m128_f32( vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a)))); } // Convert packed 32-bit integers in b to packed single-precision (32-bit) // floating-point elements, store the results in the lower 2 elements of dst, // and copy the upper 2 packed elements from a to the upper elements of dst. // // dst[31:0] := Convert_Int32_To_FP32(b[31:0]) // dst[63:32] := Convert_Int32_To_FP32(b[63:32]) // dst[95:64] := a[95:64] // dst[127:96] := a[127:96] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b) { return vreinterpretq_m128_f32( vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), vget_high_f32(vreinterpretq_f32_m128(a)))); } // Convert packed signed 32-bit integers in a to packed single-precision // (32-bit) floating-point elements, store the results in the lower 2 elements // of dst, then convert the packed signed 32-bit integers in b to // single-precision (32-bit) floating-point element, and store the results in // the upper 2 elements of dst. // // dst[31:0] := Convert_Int32_To_FP32(a[31:0]) // dst[63:32] := Convert_Int32_To_FP32(a[63:32]) // dst[95:64] := Convert_Int32_To_FP32(b[31:0]) // dst[127:96] := Convert_Int32_To_FP32(b[63:32]) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b) { return vreinterpretq_m128_f32(vcvtq_f32_s32( vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)))); } // Convert the lower packed 8-bit integers in a to packed single-precision // (32-bit) floating-point elements, and store the results in dst. // // FOR j := 0 to 3 // i := j*8 // m := j*32 // dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a) { return vreinterpretq_m128_f32(vcvtq_f32_s32( vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a)))))); } // Convert packed single-precision (32-bit) floating-point elements in a to // packed 16-bit integers, and store the results in dst. Note: this intrinsic // will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and // 0x7FFFFFFF. 
// // FOR j := 0 to 3 // i := 16*j // k := 32*j // IF a[k+31:k] >= FP32(0x7FFF) && a[k+31:k] <= FP32(0x7FFFFFFF) // dst[i+15:i] := 0x7FFF // ELSE // dst[i+15:i] := Convert_FP32_To_Int16(a[k+31:k]) // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi16 FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a) { const __m128 i16Min = _mm_set_ps1((float) INT16_MIN); const __m128 i16Max = _mm_set_ps1((float) INT16_MAX); const __m128 i32Max = _mm_set_ps1((float) INT32_MAX); const __m128i maxMask = _mm_castps_si128( _mm_and_ps(_mm_cmpge_ps(a, i16Max), _mm_cmple_ps(a, i32Max))); const __m128i betweenMask = _mm_castps_si128( _mm_and_ps(_mm_cmpgt_ps(a, i16Min), _mm_cmplt_ps(a, i16Max))); const __m128i minMask = _mm_cmpeq_epi32(_mm_or_si128(maxMask, betweenMask), _mm_setzero_si128()); __m128i max = _mm_and_si128(maxMask, _mm_set1_epi32(INT16_MAX)); __m128i min = _mm_and_si128(minMask, _mm_set1_epi32(INT16_MIN)); __m128i cvt = _mm_and_si128(betweenMask, _mm_cvtps_epi32(a)); __m128i res32 = _mm_or_si128(_mm_or_si128(max, min), cvt); return vreinterpret_m64_s16(vmovn_s32(vreinterpretq_s32_m128i(res32))); } // Convert packed single-precision (32-bit) floating-point elements in a to // packed 32-bit integers, and store the results in dst. // // FOR j := 0 to 1 // i := 32*j // dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi32 #define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a) // Convert packed single-precision (32-bit) floating-point elements in a to // packed 8-bit integers, and store the results in lower 4 elements of dst. // Note: this intrinsic will generate 0x7F, rather than 0x80, for input values // between 0x7F and 0x7FFFFFFF. // // FOR j := 0 to 3 // i := 8*j // k := 32*j // IF a[k+31:k] >= FP32(0x7F) && a[k+31:k] <= FP32(0x7FFFFFFF) // dst[i+7:i] := 0x7F // ELSE // dst[i+7:i] := Convert_FP32_To_Int8(a[k+31:k]) // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi8 FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a) { const __m128 i8Min = _mm_set_ps1((float) INT8_MIN); const __m128 i8Max = _mm_set_ps1((float) INT8_MAX); const __m128 i32Max = _mm_set_ps1((float) INT32_MAX); const __m128i maxMask = _mm_castps_si128( _mm_and_ps(_mm_cmpge_ps(a, i8Max), _mm_cmple_ps(a, i32Max))); const __m128i betweenMask = _mm_castps_si128( _mm_and_ps(_mm_cmpgt_ps(a, i8Min), _mm_cmplt_ps(a, i8Max))); const __m128i minMask = _mm_cmpeq_epi32(_mm_or_si128(maxMask, betweenMask), _mm_setzero_si128()); __m128i max = _mm_and_si128(maxMask, _mm_set1_epi32(INT8_MAX)); __m128i min = _mm_and_si128(minMask, _mm_set1_epi32(INT8_MIN)); __m128i cvt = _mm_and_si128(betweenMask, _mm_cvtps_epi32(a)); __m128i res32 = _mm_or_si128(_mm_or_si128(max, min), cvt); int16x4_t res16 = vmovn_s32(vreinterpretq_s32_m128i(res32)); int8x8_t res8 = vmovn_s16(vcombine_s16(res16, res16)); static const uint32_t bitMask[2] = {0xFFFFFFFF, 0}; int8x8_t mask = vreinterpret_s8_u32(vld1_u32(bitMask)); return vreinterpret_m64_s8(vorr_s8(vand_s8(mask, res8), vdup_n_s8(0))); } // Convert packed unsigned 16-bit integers in a to packed single-precision // (32-bit) floating-point elements, and store the results in dst. 
// // FOR j := 0 to 3 // i := j*16 // m := j*32 // dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a) { return vreinterpretq_m128_f32( vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a)))); } // Convert the lower packed unsigned 8-bit integers in a to packed // single-precision (32-bit) floating-point elements, and store the results in // dst. // // FOR j := 0 to 3 // i := j*8 // m := j*32 // dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a) { return vreinterpretq_m128_f32(vcvtq_f32_u32( vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a)))))); } // Convert the signed 32-bit integer b to a single-precision (32-bit) // floating-point element, store the result in the lower element of dst, and // copy the upper 3 packed elements from a to the upper elements of dst. // // dst[31:0] := Convert_Int32_To_FP32(b[31:0]) // dst[127:32] := a[127:32] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss #define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b) // Convert the signed 64-bit integer b to a single-precision (32-bit) // floating-point element, store the result in the lower element of dst, and // copy the upper 3 packed elements from a to the upper elements of dst. // // dst[31:0] := Convert_Int64_To_FP32(b[63:0]) // dst[127:32] := a[127:32] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b) { return vreinterpretq_m128_f32( vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); } // Copy the lower single-precision (32-bit) floating-point element of a to dst. // // dst[31:0] := a[31:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32 FORCE_INLINE float _mm_cvtss_f32(__m128 a) { return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); } // Convert the lower single-precision (32-bit) floating-point element in a to a // 32-bit integer, and store the result in dst. // // dst[31:0] := Convert_FP32_To_Int32(a[31:0]) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32 #define _mm_cvtss_si32(a) _mm_cvt_ss2si(a) // Convert the lower single-precision (32-bit) floating-point element in a to a // 64-bit integer, and store the result in dst. // // dst[63:0] := Convert_FP32_To_Int64(a[31:0]) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64 FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a) { #if defined(__aarch64__) return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0); #else float32_t data = vgetq_lane_f32( vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0); return (int64_t) data; #endif } // Convert packed single-precision (32-bit) floating-point elements in a to // packed 32-bit integers with truncation, and store the results in dst. 
// // FOR j := 0 to 1 // i := 32*j // dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a) { return vreinterpret_m64_s32( vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)))); } // Convert the lower single-precision (32-bit) floating-point element in a to a // 32-bit integer with truncation, and store the result in dst. // // dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si FORCE_INLINE int _mm_cvtt_ss2si(__m128 a) { return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0); } // Convert packed single-precision (32-bit) floating-point elements in a to // packed 32-bit integers with truncation, and store the results in dst. // // FOR j := 0 to 1 // i := 32*j // dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32 #define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a) // Convert the lower single-precision (32-bit) floating-point element in a to a // 32-bit integer with truncation, and store the result in dst. // // dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32 #define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a) // Convert the lower single-precision (32-bit) floating-point element in a to a // 64-bit integer with truncation, and store the result in dst. // // dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0]) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64 FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a) { return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); } // Divides the four single-precision, floating-point values of a and b. // // r0 := a0 / b0 // r1 := a1 / b1 // r2 := a2 / b2 // r3 := a3 / b3 // // https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b) { #if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV return vreinterpretq_m128_f32( vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); #else float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b)); recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b))); #if SSE2NEON_PRECISE_DIV // Additional Netwon-Raphson iteration for accuracy recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b))); #endif return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip)); #endif } // Divides the scalar single-precision floating point value of a by b. // https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b) { float32_t value = vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0); return vreinterpretq_m128_f32( vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); } // Extract a 16-bit integer from a, selected with imm8, and store the result in // the lower element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_pi16 #define _mm_extract_pi16(a, imm) \ (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm)) // Free aligned memory that was allocated with _mm_malloc. 
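/* Usage sketch (an illustrative addition, not part of the original sse2neon
 * sources): _mm_cvt_ss2si honours the current rounding mode, while the
 * truncating variants above always round towards zero:
 *
 *   __m128 v = _mm_set_ss(2.7f);
 *   int r = _mm_cvt_ss2si(v);    // 3 under the default round-to-nearest
 *   int t = _mm_cvtt_ss2si(v);   // 2, the fractional part is discarded
 */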
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_free FORCE_INLINE void _mm_free(void *addr) { FREE(addr); } // Macro: Get the flush zero bits from the MXCSR control and status register. // The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or // _MM_FLUSH_ZERO_OFF // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_FLUSH_ZERO_MODE FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode() { union { fpcr_bitfield field; #if defined(__aarch64__) uint64_t value; #else uint32_t value; #endif } r; #if defined(__aarch64__) __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */ #else __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ #endif return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF; } // Macro: Get the rounding mode bits from the MXCSR control and status register. // The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, // _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_ROUNDING_MODE FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE() { union { fpcr_bitfield field; #if defined(__aarch64__) uint64_t value; #else uint32_t value; #endif } r; #if defined(__aarch64__) __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */ #else __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ #endif if (r.field.bit22) { return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP; } else { return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST; } } // Copy a to dst, and insert the 16-bit integer i into dst at the location // specified by imm8. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_pi16 #define _mm_insert_pi16(a, b, imm) \ __extension__({ \ vreinterpret_m64_s16( \ vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \ }) // Loads four single-precision, floating-point values. // https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx FORCE_INLINE __m128 _mm_load_ps(const float *p) { return vreinterpretq_m128_f32(vld1q_f32(p)); } // Load a single-precision (32-bit) floating-point element from memory into all // elements of dst. // // dst[31:0] := MEM[mem_addr+31:mem_addr] // dst[63:32] := MEM[mem_addr+31:mem_addr] // dst[95:64] := MEM[mem_addr+31:mem_addr] // dst[127:96] := MEM[mem_addr+31:mem_addr] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1 #define _mm_load_ps1 _mm_load1_ps // Loads an single - precision, floating - point value into the low word and // clears the upper three words. // https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx FORCE_INLINE __m128 _mm_load_ss(const float *p) { return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0)); } // Loads a single single-precision, floating-point value, copying it into all // four words // https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx FORCE_INLINE __m128 _mm_load1_ps(const float *p) { return vreinterpretq_m128_f32(vld1q_dup_f32(p)); } // Sets the upper two single-precision, floating-point values with 64 // bits of data loaded from the address p; the lower two values are passed // through from a. 
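/* Usage sketch (an illustrative addition, not part of the original sse2neon
 * sources): the FPCR/FPSCR accessors above emulate the corresponding MXCSR
 * queries, e.g.
 *
 *   if (_MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST) {
 *       // the default IEEE round-to-nearest mode is active
 *   }
 *   unsigned int fz = _sse2neon_mm_get_flush_zero_mode();
 *   // fz is either _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF
 */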
// // r0 := a0 // r1 := a1 // r2 := *p0 // r3 := *p1 // // https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p) { return vreinterpretq_m128_f32( vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p))); } // Sets the lower two single-precision, floating-point values with 64 // bits of data loaded from the address p; the upper two values are passed // through from a. // // Return Value // r0 := *p0 // r1 := *p1 // r2 := a2 // r3 := a3 // // https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p) { return vreinterpretq_m128_f32( vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a))); } // Load 4 single-precision (32-bit) floating-point elements from memory into dst // in reverse order. mem_addr must be aligned on a 16-byte boundary or a // general-protection exception may be generated. // // dst[31:0] := MEM[mem_addr+127:mem_addr+96] // dst[63:32] := MEM[mem_addr+95:mem_addr+64] // dst[95:64] := MEM[mem_addr+63:mem_addr+32] // dst[127:96] := MEM[mem_addr+31:mem_addr] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps FORCE_INLINE __m128 _mm_loadr_ps(const float *p) { float32x4_t v = vrev64q_f32(vld1q_f32(p)); return vreinterpretq_m128_f32(vextq_f32(v, v, 2)); } // Loads four single-precision, floating-point values. // https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx FORCE_INLINE __m128 _mm_loadu_ps(const float *p) { // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are // equivalent for neon return vreinterpretq_m128_f32(vld1q_f32(p)); } // Load unaligned 16-bit integer from memory into the first element of dst. // // dst[15:0] := MEM[mem_addr+15:mem_addr] // dst[MAX:16] := 0 // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16 FORCE_INLINE __m128i _mm_loadu_si16(const void *p) { return vreinterpretq_m128i_s16( vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0)); } // Load unaligned 64-bit integer from memory into the first element of dst. // // dst[63:0] := MEM[mem_addr+63:mem_addr] // dst[MAX:64] := 0 // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64 FORCE_INLINE __m128i _mm_loadu_si64(const void *p) { return vreinterpretq_m128i_s64( vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0))); } // Allocate aligned blocks of memory. // https://software.intel.com/en-us/ // cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks FORCE_INLINE void *_mm_malloc(size_t size, size_t align) { void *ptr; if (align == 1) return MALLOC(size); if (align == 2 || (sizeof(void *) == 8 && align == 4)) align = sizeof(void *); if (!posix_memalign(&ptr, align, size)) return ptr; return NULL; } // Conditionally store 8-bit integer elements from a into memory using mask // (elements are not stored when the highest bit is not set in the corresponding // element) and a non-temporal memory hint. 
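/* Usage sketch (an illustrative addition, not part of the original sse2neon
 * sources): _mm_malloc/_mm_free above provide 16-byte aligned storage that
 * is safe to use with the aligned load/store intrinsics:
 *
 *   float *buf = (float *) _mm_malloc(4 * sizeof(float), 16);
 *   if (buf) {
 *       _mm_store_ps(buf, _mm_set1_ps(1.0f));   // aligned store
 *       _mm_free(buf);
 *   }
 */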
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmove_si64 FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr) { int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7); __m128 b = _mm_load_ps((const float *) mem_addr); int8x8_t masked = vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a), vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b)))); vst1_s8((int8_t *) mem_addr, masked); } // Conditionally store 8-bit integer elements from a into memory using mask // (elements are not stored when the highest bit is not set in the corresponding // element) and a non-temporal memory hint. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_maskmovq #define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr) // Compare packed signed 16-bit integers in a and b, and store packed maximum // values in dst. // // FOR j := 0 to 3 // i := j*16 // dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16 FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b) { return vreinterpret_m64_s16( vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); } // Computes the maximums of the four single-precision, floating-point values of // a and b. // https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b) { #if SSE2NEON_PRECISE_MINMAX float32x4_t _a = vreinterpretq_f32_m128(a); float32x4_t _b = vreinterpretq_f32_m128(b); return vreinterpretq_m128_f32(vbslq_f32(vcgtq_f32(_a, _b), _a, _b)); #else return vreinterpretq_m128_f32( vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); #endif } // Compare packed unsigned 8-bit integers in a and b, and store packed maximum // values in dst. // // FOR j := 0 to 7 // i := j*8 // dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8 FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b) { return vreinterpret_m64_u8( vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); } // Computes the maximum of the two lower scalar single-precision floating point // values of a and b. // https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b) { float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0); return vreinterpretq_m128_f32( vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); } // Compare packed signed 16-bit integers in a and b, and store packed minimum // values in dst. // // FOR j := 0 to 3 // i := j*16 // dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16 FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b) { return vreinterpret_m64_s16( vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); } // Computes the minima of the four single-precision, floating-point values of a // and b. // https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b) { #if SSE2NEON_PRECISE_MINMAX float32x4_t _a = vreinterpretq_f32_m128(a); float32x4_t _b = vreinterpretq_f32_m128(b); return vreinterpretq_m128_f32(vbslq_f32(vcltq_f32(_a, _b), _a, _b)); #else return vreinterpretq_m128_f32( vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); #endif } // Compare packed unsigned 8-bit integers in a and b, and store packed minimum // values in dst. 
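/* Usage sketch (an illustrative addition, not part of the original sse2neon
 * sources): _mm_max_ps/_mm_min_ps above compose into the usual branchless
 * clamp; the SSE2NEON_PRECISE_MINMAX path exists to follow the x86
 * NaN/ordering behaviour more closely. For __m128 values x, lower, upper:
 *
 *   __m128 clamped = _mm_min_ps(_mm_max_ps(x, lower), upper);
 *   // every lane of clamped lies within [lower, upper]
 */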
// // FOR j := 0 to 7 // i := j*8 // dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8 FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b) { return vreinterpret_m64_u8( vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); } // Computes the minimum of the two lower scalar single-precision floating point // values of a and b. // https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b) { float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0); return vreinterpretq_m128_f32( vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); } // Sets the low word to the single-precision, floating-point value of b // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100) FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b) { return vreinterpretq_m128_f32( vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0), vreinterpretq_f32_m128(a), 0)); } // Moves the upper two values of B into the lower two values of A. // // r3 := a3 // r2 := a2 // r1 := b3 // r0 := b2 FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B) { float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A)); float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B)); return vreinterpretq_m128_f32(vcombine_f32(b32, a32)); } // Moves the lower two values of B into the upper two values of A. // // r3 := b1 // r2 := b0 // r1 := a1 // r0 := a0 FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B) { float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A)); float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B)); return vreinterpretq_m128_f32(vcombine_f32(a10, b10)); } // Create mask from the most significant bit of each 8-bit element in a, and // store the result in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pi8 FORCE_INLINE int _mm_movemask_pi8(__m64 a) { uint8x8_t input = vreinterpret_u8_m64(a); #if defined(__aarch64__) static const int8x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7}; uint8x8_t tmp = vshr_n_u8(input, 7); return vaddv_u8(vshl_u8(tmp, shift)); #else // Refer the implementation of `_mm_movemask_epi8` uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7)); uint32x2_t paired16 = vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7)); uint8x8_t paired32 = vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14)); return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4); #endif } // NEON does not provide this method // Creates a 4-bit mask from the most significant bits of the four // single-precision, floating-point values. // https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx FORCE_INLINE int _mm_movemask_ps(__m128 a) { uint32x4_t input = vreinterpretq_u32_m128(a); #if defined(__aarch64__) static const int32x4_t shift = {0, 1, 2, 3}; uint32x4_t tmp = vshrq_n_u32(input, 31); return vaddvq_u32(vshlq_u32(tmp, shift)); #else // Uses the exact same method as _mm_movemask_epi8, see that for details. // Shift out everything but the sign bits with a 32-bit unsigned shift // right. uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31)); // Merge the two pairs together with a 64-bit unsigned shift right + add. uint8x16_t paired = vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31)); // Extract the result. 
return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2); #endif } // Multiplies the four single-precision, floating-point values of a and b. // // r0 := a0 * b0 // r1 := a1 * b1 // r2 := a2 * b2 // r3 := a3 * b3 // // https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b) { return vreinterpretq_m128_f32( vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } // Multiply the lower single-precision (32-bit) floating-point element in a and // b, store the result in the lower element of dst, and copy the upper 3 packed // elements from a to the upper elements of dst. // // dst[31:0] := a[31:0] * b[31:0] // dst[127:32] := a[127:32] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_mul_ps(a, b)); } // Multiply the packed unsigned 16-bit integers in a and b, producing // intermediate 32-bit integers, and store the high 16 bits of the intermediate // integers in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16 FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b) { return vreinterpret_m64_u16(vshrn_n_u32( vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16)); } // Computes the bitwise OR of the four single-precision, floating-point values // of a and b. // https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) { return vreinterpretq_m128_s32( vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); } // Average packed unsigned 8-bit integers in a and b, and store the results in // dst. // // FOR j := 0 to 7 // i := j*8 // dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb #define _m_pavgb(a, b) _mm_avg_pu8(a, b) // Average packed unsigned 16-bit integers in a and b, and store the results in // dst. // // FOR j := 0 to 3 // i := j*16 // dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw #define _m_pavgw(a, b) _mm_avg_pu16(a, b) // Extract a 16-bit integer from a, selected with imm8, and store the result in // the lower element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pextrw #define _m_pextrw(a, imm) _mm_extract_pi16(a, imm) // Copy a to dst, and insert the 16-bit integer i into dst at the location // specified by imm8. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_pinsrw #define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm) // Compare packed signed 16-bit integers in a and b, and store packed maximum // values in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxsw #define _m_pmaxsw(a, b) _mm_max_pi16(a, b) // Compare packed unsigned 8-bit integers in a and b, and store packed maximum // values in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxub #define _m_pmaxub(a, b) _mm_max_pu8(a, b) // Compare packed signed 16-bit integers in a and b, and store packed minimum // values in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminsw #define _m_pminsw(a, b) _mm_min_pi16(a, b) // Compare packed unsigned 8-bit integers in a and b, and store packed minimum // values in dst. 
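/* Usage sketch (an illustrative addition, not part of the original sse2neon
 * sources): combining a comparison with _mm_movemask_ps above gives a cheap
 * "any lane / all lanes" test:
 *
 *   int neg = _mm_movemask_ps(_mm_cmplt_ps(v, _mm_setzero_ps()));
 *   // bit i of neg is the sign bit of lane i; neg != 0 means some lane of
 *   // v is negative, neg == 0xF means all four lanes are negative
 */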
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminub #define _m_pminub(a, b) _mm_min_pu8(a, b) // Create mask from the most significant bit of each 8-bit element in a, and // store the result in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmovmskb #define _m_pmovmskb(a) _mm_movemask_pi8(a) // Multiply the packed unsigned 16-bit integers in a and b, producing // intermediate 32-bit integers, and store the high 16 bits of the intermediate // integers in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw #define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b) // Loads one cache line of data from address p to a location closer to the // processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx FORCE_INLINE void _mm_prefetch(const void *p, int i) { (void) i; __builtin_prefetch(p); } // Compute the absolute differences of packed unsigned 8-bit integers in a and // b, then horizontally sum each consecutive 8 differences to produce four // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low // 16 bits of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_psadbw #define _m_psadbw(a, b) _mm_sad_pu8(a, b) // Shuffle 16-bit integers in a using the control in imm8, and store the results // in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pshufw #define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm) // Compute the approximate reciprocal of packed single-precision (32-bit) // floating-point elements in a, and store the results in dst. The maximum // relative error for this approximation is less than 1.5*2^-12. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps FORCE_INLINE __m128 _mm_rcp_ps(__m128 in) { float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in)); recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in))); #if SSE2NEON_PRECISE_DIV // Additional Netwon-Raphson iteration for accuracy recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in))); #endif return vreinterpretq_m128_f32(recip); } // Compute the approximate reciprocal of the lower single-precision (32-bit) // floating-point element in a, store the result in the lower element of dst, // and copy the upper 3 packed elements from a to the upper elements of dst. The // maximum relative error for this approximation is less than 1.5*2^-12. // // dst[31:0] := (1.0 / a[31:0]) // dst[127:32] := a[127:32] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss FORCE_INLINE __m128 _mm_rcp_ss(__m128 a) { return _mm_move_ss(a, _mm_rcp_ps(a)); } // Computes the approximations of the reciprocal square roots of the four // single-precision floating point values of in. // The current precision is 1% error. 
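/* Usage sketch (an illustrative addition, not part of the original sse2neon
 * sources): the reciprocal square-root approximation below is the usual
 * building block for fast normalisation; define SSE2NEON_PRECISE_SQRT if
 * the raw estimate is too coarse. For coordinate vectors x and y:
 *
 *   __m128 len2 = _mm_add_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y));
 *   __m128 inv  = _mm_rsqrt_ps(len2);   // approximately 1 / sqrt(len2)
 *   x = _mm_mul_ps(x, inv);
 *   y = _mm_mul_ps(y, inv);
 */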
// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in) { float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in)); #if SSE2NEON_PRECISE_SQRT // Additional Netwon-Raphson iteration for accuracy out = vmulq_f32( out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out)); out = vmulq_f32( out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out)); #endif return vreinterpretq_m128_f32(out); } // Compute the approximate reciprocal square root of the lower single-precision // (32-bit) floating-point element in a, store the result in the lower element // of dst, and copy the upper 3 packed elements from a to the upper elements of // dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in) { return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0); } // Compute the absolute differences of packed unsigned 8-bit integers in a and // b, then horizontally sum each consecutive 8 differences to produce four // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low // 16 bits of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8 FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b) { uint64x1_t t = vpaddl_u32(vpaddl_u16( vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))))); return vreinterpret_m64_u16( vset_lane_u16(vget_lane_u64(t, 0), vdup_n_u16(0), 0)); } // Macro: Set the flush zero bits of the MXCSR control and status register to // the value in unsigned 32-bit integer a. The flush zero may contain any of the // following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_FLUSH_ZERO_MODE FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag) { // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting, // regardless of the value of the FZ bit. union { fpcr_bitfield field; #if defined(__aarch64__) uint64_t value; #else uint32_t value; #endif } r; #if defined(__aarch64__) __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */ #else __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ #endif r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON; #if defined(__aarch64__) __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */ #else __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ #endif } // Sets the four single-precision, floating-point values to the four inputs. // https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x) { float ALIGN_STRUCT(16) data[4] = {x, y, z, w}; return vreinterpretq_m128_f32(vld1q_f32(data)); } // Sets the four single-precision, floating-point values to w. // https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx FORCE_INLINE __m128 _mm_set_ps1(float _w) { return vreinterpretq_m128_f32(vdupq_n_f32(_w)); } // Macro: Set the rounding mode bits of the MXCSR control and status register to // the value in unsigned 32-bit integer a. 
The rounding mode may contain any of // the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, // _MM_ROUND_TOWARD_ZERO // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding) { union { fpcr_bitfield field; #if defined(__aarch64__) uint64_t value; #else uint32_t value; #endif } r; #if defined(__aarch64__) __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */ #else __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ #endif switch (rounding) { case _MM_ROUND_TOWARD_ZERO: r.field.bit22 = 1; r.field.bit23 = 1; break; case _MM_ROUND_DOWN: r.field.bit22 = 0; r.field.bit23 = 1; break; case _MM_ROUND_UP: r.field.bit22 = 1; r.field.bit23 = 0; break; default: //_MM_ROUND_NEAREST r.field.bit22 = 0; r.field.bit23 = 0; } #if defined(__aarch64__) __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */ #else __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ #endif } // Copy single-precision (32-bit) floating-point element a to the lower element // of dst, and zero the upper 3 elements. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss FORCE_INLINE __m128 _mm_set_ss(float a) { float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0}; return vreinterpretq_m128_f32(vld1q_f32(data)); } // Sets the four single-precision, floating-point values to w. // // r0 := r1 := r2 := r3 := w // // https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx FORCE_INLINE __m128 _mm_set1_ps(float _w) { return vreinterpretq_m128_f32(vdupq_n_f32(_w)); } // FIXME: _mm_setcsr() implementation supports changing the rounding mode only. FORCE_INLINE void _mm_setcsr(unsigned int a) { _MM_SET_ROUNDING_MODE(a); } // FIXME: _mm_getcsr() implementation supports reading the rounding mode only. FORCE_INLINE unsigned int _mm_getcsr() { return _MM_GET_ROUNDING_MODE(); } // Sets the four single-precision, floating-point values to the four inputs in // reverse order. // https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x) { float ALIGN_STRUCT(16) data[4] = {w, z, y, x}; return vreinterpretq_m128_f32(vld1q_f32(data)); } // Clears the four single-precision, floating-point values. // https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx FORCE_INLINE __m128 _mm_setzero_ps(void) { return vreinterpretq_m128_f32(vdupq_n_f32(0)); } // Shuffle 16-bit integers in a using the control in imm8, and store the results // in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pi16 #if __has_builtin(__builtin_shufflevector) #define _mm_shuffle_pi16(a, imm) \ __extension__({ \ vreinterpret_m64_s16(__builtin_shufflevector( \ vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \ ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3))); \ }) #else #define _mm_shuffle_pi16(a, imm) \ __extension__({ \ int16x4_t ret; \ ret = \ vmov_n_s16(vget_lane_s16(vreinterpret_s16_m64(a), (imm) & (0x3))); \ ret = vset_lane_s16( \ vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 2) & 0x3), ret, \ 1); \ ret = vset_lane_s16( \ vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 4) & 0x3), ret, \ 2); \ ret = vset_lane_s16( \ vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 6) & 0x3), ret, \ 3); \ vreinterpret_m64_s16(ret); \ }) #endif // Guarantees that every preceding store is globally visible before any // subsequent store. 
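// Illustrative usage sketch (hypothetical helper, not part of the SSE API):
// the typical save/set/restore pattern for the rounding-mode emulation above,
// using only _mm_getcsr()/_mm_setcsr() as defined in this file.
static inline unsigned int sse2neon_demo_with_round_down(void)
{
    unsigned int saved = _mm_getcsr();  // here: just the current rounding mode
    _mm_setcsr(_MM_ROUND_DOWN);         // conversions now round toward -infinity
    /* ... rounding-sensitive conversions would run here ... */
    _mm_setcsr(saved);                  // restore the caller's mode
    return saved;
}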
// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx FORCE_INLINE void _mm_sfence(void) { __sync_synchronize(); } // FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255) // int imm) #if __has_builtin(__builtin_shufflevector) #define _mm_shuffle_ps(a, b, imm) \ __extension__({ \ float32x4_t _input1 = vreinterpretq_f32_m128(a); \ float32x4_t _input2 = vreinterpretq_f32_m128(b); \ float32x4_t _shuf = __builtin_shufflevector( \ _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \ (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \ vreinterpretq_m128_f32(_shuf); \ }) #else // generic #define _mm_shuffle_ps(a, b, imm) \ __extension__({ \ __m128 ret; \ switch (imm) { \ case _MM_SHUFFLE(1, 0, 3, 2): \ ret = _mm_shuffle_ps_1032((a), (b)); \ break; \ case _MM_SHUFFLE(2, 3, 0, 1): \ ret = _mm_shuffle_ps_2301((a), (b)); \ break; \ case _MM_SHUFFLE(0, 3, 2, 1): \ ret = _mm_shuffle_ps_0321((a), (b)); \ break; \ case _MM_SHUFFLE(2, 1, 0, 3): \ ret = _mm_shuffle_ps_2103((a), (b)); \ break; \ case _MM_SHUFFLE(1, 0, 1, 0): \ ret = _mm_movelh_ps((a), (b)); \ break; \ case _MM_SHUFFLE(1, 0, 0, 1): \ ret = _mm_shuffle_ps_1001((a), (b)); \ break; \ case _MM_SHUFFLE(0, 1, 0, 1): \ ret = _mm_shuffle_ps_0101((a), (b)); \ break; \ case _MM_SHUFFLE(3, 2, 1, 0): \ ret = _mm_shuffle_ps_3210((a), (b)); \ break; \ case _MM_SHUFFLE(0, 0, 1, 1): \ ret = _mm_shuffle_ps_0011((a), (b)); \ break; \ case _MM_SHUFFLE(0, 0, 2, 2): \ ret = _mm_shuffle_ps_0022((a), (b)); \ break; \ case _MM_SHUFFLE(2, 2, 0, 0): \ ret = _mm_shuffle_ps_2200((a), (b)); \ break; \ case _MM_SHUFFLE(3, 2, 0, 2): \ ret = _mm_shuffle_ps_3202((a), (b)); \ break; \ case _MM_SHUFFLE(3, 2, 3, 2): \ ret = _mm_movehl_ps((b), (a)); \ break; \ case _MM_SHUFFLE(1, 1, 3, 3): \ ret = _mm_shuffle_ps_1133((a), (b)); \ break; \ case _MM_SHUFFLE(2, 0, 1, 0): \ ret = _mm_shuffle_ps_2010((a), (b)); \ break; \ case _MM_SHUFFLE(2, 0, 0, 1): \ ret = _mm_shuffle_ps_2001((a), (b)); \ break; \ case _MM_SHUFFLE(2, 0, 3, 2): \ ret = _mm_shuffle_ps_2032((a), (b)); \ break; \ default: \ ret = _mm_shuffle_ps_default((a), (b), (imm)); \ break; \ } \ ret; \ }) #endif // Computes the approximations of square roots of the four single-precision, // floating-point values of a. First computes reciprocal square roots and then // reciprocals of the four values. // // r0 := SQRT(a0) // r1 := SQRT(a1) // r2 := SQRT(a2) // r3 := SQRT(a3) // // https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in) { #if SSE2NEON_PRECISE_SQRT float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in)); // Test for vrsqrteq_f32(0) -> positive infinity case. // Change to zero, so that s * 1/SQRT(s) result is zero too. 
const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000); const uint32x4_t div_by_zero = vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip)); recip = vreinterpretq_f32_u32( vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip))); // Additional Netwon-Raphson iteration for accuracy recip = vmulq_f32( vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)), recip); recip = vmulq_f32( vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)), recip); // SQRT(s) = s * 1/SQRT(s) return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip)); #elif defined(__aarch64__) return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in))); #else float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in)); float32x4_t sq = vrecpeq_f32(recipsq); return vreinterpretq_m128_f32(sq); #endif } // Computes the approximation of the square root of the scalar single-precision // floating point value of in. // https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in) { float32_t value = vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0); return vreinterpretq_m128_f32( vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0)); } // Stores four single-precision, floating-point values. // https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx FORCE_INLINE void _mm_store_ps(float *p, __m128 a) { vst1q_f32(p, vreinterpretq_f32_m128(a)); } // Store the lower single-precision (32-bit) floating-point element from a into // 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte // boundary or a general-protection exception may be generated. // // MEM[mem_addr+31:mem_addr] := a[31:0] // MEM[mem_addr+63:mem_addr+32] := a[31:0] // MEM[mem_addr+95:mem_addr+64] := a[31:0] // MEM[mem_addr+127:mem_addr+96] := a[31:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1 FORCE_INLINE void _mm_store_ps1(float *p, __m128 a) { float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); vst1q_f32(p, vdupq_n_f32(a0)); } // Stores the lower single - precision, floating - point value. // https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx FORCE_INLINE void _mm_store_ss(float *p, __m128 a) { vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0); } // Store the lower single-precision (32-bit) floating-point element from a into // 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte // boundary or a general-protection exception may be generated. // // MEM[mem_addr+31:mem_addr] := a[31:0] // MEM[mem_addr+63:mem_addr+32] := a[31:0] // MEM[mem_addr+95:mem_addr+64] := a[31:0] // MEM[mem_addr+127:mem_addr+96] := a[31:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps #define _mm_store1_ps _mm_store_ps1 // Stores the upper two single-precision, floating-point values of a to the // address p. // // *p0 := a2 // *p1 := a3 // // https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a) { *p = vreinterpret_m64_f32(vget_high_f32(a)); } // Stores the lower two single-precision floating point values of a to the // address p. // // *p0 := a0 // *p1 := a1 // // https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a) { *p = vreinterpret_m64_f32(vget_low_f32(a)); } // Store 4 single-precision (32-bit) floating-point elements from a into memory // in reverse order. 
mem_addr must be aligned on a 16-byte boundary or a // general-protection exception may be generated. // // MEM[mem_addr+31:mem_addr] := a[127:96] // MEM[mem_addr+63:mem_addr+32] := a[95:64] // MEM[mem_addr+95:mem_addr+64] := a[63:32] // MEM[mem_addr+127:mem_addr+96] := a[31:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps FORCE_INLINE void _mm_storer_ps(float *p, __m128 a) { float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a)); float32x4_t rev = vextq_f32(tmp, tmp, 2); vst1q_f32(p, rev); } // Stores four single-precision, floating-point values. // https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a) { vst1q_f32(p, vreinterpretq_f32_m128(a)); } // Stores 16-bits of integer data a at the address p. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si16 FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a) { vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0); } // Stores 64-bits of integer data a at the address p. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si64 FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a) { vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0); } // Store 64-bits of integer data from a into memory using a non-temporal memory // hint. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pi FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a) { vst1_s64((int64_t *) p, vreinterpret_s64_m64(a)); } // Store 128-bits (composed of 4 packed single-precision (32-bit) floating- // point elements) from a into memory using a non-temporal memory hint. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps FORCE_INLINE void _mm_stream_ps(float *p, __m128 a) { #if __has_builtin(__builtin_nontemporal_store) __builtin_nontemporal_store(a, (float32x4_t *) p); #else vst1q_f32(p, vreinterpretq_f32_m128(a)); #endif } // Subtracts the four single-precision, floating-point values of a and b. // // r0 := a0 - b0 // r1 := a1 - b1 // r2 := a2 - b2 // r3 := a3 - b3 // // https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b) { return vreinterpretq_m128_f32( vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } // Subtract the lower single-precision (32-bit) floating-point element in b from // the lower single-precision (32-bit) floating-point element in a, store the // result in the lower element of dst, and copy the upper 3 packed elements from // a to the upper elements of dst. // // dst[31:0] := a[31:0] - b[31:0] // dst[127:32] := a[127:32] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_sub_ps(a, b)); } // Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision // (32-bit) floating-point elements in row0, row1, row2, and row3, and store the // transposed matrix in these vectors (row0 now contains column 0, etc.). 
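// Illustrative usage sketch (hypothetical helper, not part of the SSE API):
// element-wise c = a - b over four floats, combining the unaligned store and
// subtract wrappers above; _mm_loadu_ps() is assumed to be defined earlier in
// this file.
static inline void sse2neon_demo_sub4(float *c, const float *a, const float *b)
{
    __m128 va = _mm_loadu_ps(a);
    __m128 vb = _mm_loadu_ps(b);
    _mm_storeu_ps(c, _mm_sub_ps(va, vb)); // c[i] = a[i] - b[i], i = 0..3
}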
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ do { \ float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \ float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \ row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \ vget_low_f32(ROW23.val[0])); \ row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \ vget_low_f32(ROW23.val[1])); \ row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \ vget_high_f32(ROW23.val[0])); \ row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \ vget_high_f32(ROW23.val[1])); \ } while (0) // according to the documentation, these intrinsics behave the same as the // non-'u' versions. We'll just alias them here. #define _mm_ucomieq_ss _mm_comieq_ss #define _mm_ucomige_ss _mm_comige_ss #define _mm_ucomigt_ss _mm_comigt_ss #define _mm_ucomile_ss _mm_comile_ss #define _mm_ucomilt_ss _mm_comilt_ss #define _mm_ucomineq_ss _mm_comineq_ss // Return vector of type __m128i with undefined elements. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_undefined_si128 FORCE_INLINE __m128i _mm_undefined_si128(void) { #if defined(__GNUC__) || defined(__clang__) //#pragma GCC diagnostic push //#pragma GCC diagnostic ignored "-Wuninitialized" #endif // __m128i a; __m128i a = ZERO_SSE2NEON(); BUG_SSE2NEON; return a; #if defined(__GNUC__) || defined(__clang__) //#pragma GCC diagnostic pop #endif } // Return vector of type __m128 with undefined elements. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps FORCE_INLINE __m128 _mm_undefined_ps(void) { #if defined(__GNUC__) || defined(__clang__) //#pragma GCC diagnostic push //#pragma GCC diagnostic ignored "-Wuninitialized" #endif // __m128 a; __m128 a = (__m128) ZERO_SSE2NEON(); BUG_SSE2NEON; return a; #if defined(__GNUC__) || defined(__clang__) //#pragma GCC diagnostic pop #endif } // Selects and interleaves the upper two single-precision, floating-point values // from a and b. // // r0 := a2 // r1 := b2 // r2 := a3 // r3 := b3 // // https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b) { #if defined(__aarch64__) return vreinterpretq_m128_f32( vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); #else float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a)); float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b)); float32x2x2_t result = vzip_f32(a1, b1); return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); #endif } // Selects and interleaves the lower two single-precision, floating-point values // from a and b. // // r0 := a0 // r1 := b0 // r2 := a1 // r3 := b1 // // https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b) { #if defined(__aarch64__) return vreinterpretq_m128_f32( vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); #else float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a)); float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b)); float32x2x2_t result = vzip_f32(a1, b1); return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); #endif } // Computes bitwise EXOR (exclusive-or) of the four single-precision, // floating-point values of a and b. 
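// Illustrative usage sketch (hypothetical helper, not part of the SSE API):
// transpose a row-major 4x4 float matrix in place with the _MM_TRANSPOSE4_PS
// macro defined above; _mm_loadu_ps() from earlier in this file is assumed.
static inline void sse2neon_demo_transpose4x4(float m[16])
{
    __m128 row0 = _mm_loadu_ps(m + 0);
    __m128 row1 = _mm_loadu_ps(m + 4);
    __m128 row2 = _mm_loadu_ps(m + 8);
    __m128 row3 = _mm_loadu_ps(m + 12);
    _MM_TRANSPOSE4_PS(row0, row1, row2, row3); // rows now hold the columns
    _mm_storeu_ps(m + 0, row0);
    _mm_storeu_ps(m + 4, row1);
    _mm_storeu_ps(m + 8, row2);
    _mm_storeu_ps(m + 12, row3);
}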
// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b) { return vreinterpretq_m128_s32( veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); } /* SSE2 */ // Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or // unsigned 16-bit integers in b. // https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_s16( vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or // unsigned 32-bit integers in b. // // r0 := a0 + b0 // r1 := a1 + b1 // r2 := a2 + b2 // r3 := a3 + b3 // // https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Adds the 2 signed or unsigned 64-bit integers in a to the 2 signed or // unsigned 64-bit integers in b. // https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b) { return vreinterpretq_m128i_s64( vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); } // Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or // unsigned 8-bit integers in b. // https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90) FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_s8( vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } // Add packed double-precision (64-bit) floating-point elements in a and b, and // store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else double *da = (double *) &a; double *db = (double *) &b; double c[2]; c[0] = da[0] + db[0]; c[1] = da[1] + db[1]; return vld1q_f32((float32_t *) c); #endif } // Add the lower double-precision (64-bit) floating-point element in a and b, // store the result in the lower element of dst, and copy the upper element from // a to the upper element of dst. // // dst[63:0] := a[63:0] + b[63:0] // dst[127:64] := a[127:64] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sd FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b) { #if defined(__aarch64__) return _mm_move_sd(a, _mm_add_pd(a, b)); #else double *da = (double *) &a; double *db = (double *) &b; double c[2]; c[0] = da[0] + db[0]; c[1] = da[1]; return vld1q_f32((float32_t *) c); #endif } // Add 64-bit integers a and b, and store the result in dst. // // dst[63:0] := a[63:0] + b[63:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64 FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b) { return vreinterpret_m64_s64( vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b))); } // Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b // and saturates. // // r0 := SignedSaturate(a0 + b0) // r1 := SignedSaturate(a1 + b1) // ... 
// r7 := SignedSaturate(a7 + b7) // // https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_s16( vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Add packed signed 8-bit integers in a and b using saturation, and store the // results in dst. // // FOR j := 0 to 15 // i := j*8 // dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8 FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_s8( vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } // Add packed unsigned 16-bit integers in a and b using saturation, and store // the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu16 FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b) { return vreinterpretq_m128i_u16( vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); } // Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in // b and saturates.. // https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); } // Compute the bitwise AND of packed double-precision (64-bit) floating-point // elements in a and b, and store the results in dst. // // FOR j := 0 to 1 // i := j*64 // dst[i+63:i] := a[i+63:i] AND b[i+63:i] // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) { return vreinterpretq_m128d_s64( vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); } // Computes the bitwise AND of the 128-bit value in a and the 128-bit value in // b. // // r := a & b // // https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Compute the bitwise NOT of packed double-precision (64-bit) floating-point // elements in a and then AND with b, and store the results in dst. // // FOR j := 0 to 1 // i := j*64 // dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b) { // *NOTE* argument swap return vreinterpretq_m128d_s64( vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a))); } // Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the // 128-bit value in a. // // r := (~a) & b // // https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( vbicq_s32(vreinterpretq_s32_m128i(b), vreinterpretq_s32_m128i(a))); // *NOTE* argument swap } // Computes the average of the 8 unsigned 16-bit integers in a and the 8 // unsigned 16-bit integers in b and rounds. // // r0 := (a0 + b0) / 2 // r1 := (a1 + b1) / 2 // ... 
// r7 := (a7 + b7) / 2 // // https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b) { return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)); } // Computes the average of the 16 unsigned 8-bit integers in a and the 16 // unsigned 8-bit integers in b and rounds. // // r0 := (a0 + b0) / 2 // r1 := (a1 + b1) / 2 // ... // r15 := (a15 + b15) / 2 // // https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); } // Shift a left by imm8 bytes while shifting in zeros, and store the results in // dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bslli_si128 #define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm) // Shift a right by imm8 bytes while shifting in zeros, and store the results in // dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bsrli_si128 #define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm) // Cast vector of type __m128d to type __m128. This intrinsic is only used for // compilation and does not generate any instructions, thus it has zero latency. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ps FORCE_INLINE __m128 _mm_castpd_ps(__m128d a) { return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a)); } // Cast vector of type __m128d to type __m128i. This intrinsic is only used for // compilation and does not generate any instructions, thus it has zero latency. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128 FORCE_INLINE __m128i _mm_castpd_si128(__m128d a) { return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a)); } // Cast vector of type __m128 to type __m128d. This intrinsic is only used for // compilation and does not generate any instructions, thus it has zero latency. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd FORCE_INLINE __m128d _mm_castps_pd(__m128 a) { return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a)); } // Applies a type cast to reinterpret four 32-bit floating point values passed // in as a 128-bit parameter as packed 32-bit integers. // https://msdn.microsoft.com/en-us/library/bb514099.aspx FORCE_INLINE __m128i _mm_castps_si128(__m128 a) { return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a)); } // Cast vector of type __m128i to type __m128d. This intrinsic is only used for // compilation and does not generate any instructions, thus it has zero latency. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_pd FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a) { #if defined(__aarch64__) return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a)); #else return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a)); #endif } // Applies a type cast to reinterpret four 32-bit integers passed in as a // 128-bit parameter as packed 32-bit floating point values. // https://msdn.microsoft.com/en-us/library/bb514029.aspx FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a) { return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a)); } // Cache line containing p is flushed and invalidated from all caches in the // coherency domain. : // https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx FORCE_INLINE void _mm_clflush(void const *p) { (void) p; // no corollary for Neon? 
} // Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or // unsigned 16-bit integers in b for equality. // https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_u16( vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Compare packed 32-bit integers in a and b for equality, and store the results // in dst FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_u32( vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or // unsigned 8-bit integers in b for equality. // https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } // Compare packed double-precision (64-bit) floating-point elements in a and b // for equality, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_pd FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_u64( vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); uint32x4_t swapped = vrev64q_u32(cmp); return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b for equality, store the result in the lower element of dst, and copy the // upper element from a to the upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_sd FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b) { return _mm_move_sd(a, _mm_cmpeq_pd(a, b)); } // Compare packed double-precision (64-bit) floating-point elements in a and b // for greater-than-or-equal, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_pd FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_u64( vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b for greater-than-or-equal, store the result in the lower element of dst, // and copy the upper element from a to the upper element of dst. 
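// Illustrative usage sketch (hypothetical helper, not part of the SSE API):
// the packed comparisons above return an all-ones / all-zeros mask per lane;
// this reads lane 0 of the _mm_cmpge_pd() mask back as a plain 0/1 boolean.
static inline int sse2neon_demo_lane0_ge(__m128d a, __m128d b)
{
    __m128d mask = _mm_cmpge_pd(a, b); // lane i = ~0 iff a[i] >= b[i]
    return (int) (vgetq_lane_u64(vreinterpretq_u64_m128d(mask), 0) & 1);
}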
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_sd FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b) { #if defined(__aarch64__) return _mm_move_sd(a, _mm_cmpge_pd(a, b)); #else // expand "_mm_cmpge_pd()" to reduce unnecessary operations uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers // in b for greater than. // // r0 := (a0 > b0) ? 0xffff : 0x0 // r1 := (a1 > b1) ? 0xffff : 0x0 // ... // r7 := (a7 > b7) ? 0xffff : 0x0 // // https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_u16( vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers // in b for greater than. // https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_u32( vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers // in b for greater than. // // r0 := (a0 > b0) ? 0xff : 0x0 // r1 := (a1 > b1) ? 0xff : 0x0 // ... // r15 := (a15 > b15) ? 0xff : 0x0 // // https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } // Compare packed double-precision (64-bit) floating-point elements in a and b // for greater-than, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_pd FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_u64( vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b for greater-than, store the result in the lower element of dst, and copy // the upper element from a to the upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_sd FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b) { #if defined(__aarch64__) return _mm_move_sd(a, _mm_cmpgt_pd(a, b)); #else // expand "_mm_cmpge_pd()" to reduce unnecessary operations uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = (*(double *) &a0) > (*(double *) &b0) ? 
~UINT64_C(0) : UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare packed double-precision (64-bit) floating-point elements in a and b // for less-than-or-equal, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_pd FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_u64( vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b for less-than-or-equal, store the result in the lower element of dst, and // copy the upper element from a to the upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_sd FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b) { #if defined(__aarch64__) return _mm_move_sd(a, _mm_cmple_pd(a, b)); #else // expand "_mm_cmpge_pd()" to reduce unnecessary operations uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers // in b for less than. // // r0 := (a0 < b0) ? 0xffff : 0x0 // r1 := (a1 < b1) ? 0xffff : 0x0 // ... // r7 := (a7 < b7) ? 0xffff : 0x0 // // https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_u16( vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers // in b for less than. // https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_u32( vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers // in b for lesser than. // https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } // Compare packed double-precision (64-bit) floating-point elements in a and b // for less-than, and store the results in dst. 
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_pd FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_u64( vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b for less-than, store the result in the lower element of dst, and copy the // upper element from a to the upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_sd FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b) { #if defined(__aarch64__) return _mm_move_sd(a, _mm_cmplt_pd(a, b)); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare packed double-precision (64-bit) floating-point elements in a and b // for not-equal, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_pd FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64( vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))))); #else // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); uint32x4_t swapped = vrev64q_u32(cmp); return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped))); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b for not-equal, store the result in the lower element of dst, and copy the // upper element from a to the upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_sd FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b) { return _mm_move_sd(a, _mm_cmpneq_pd(a, b)); } // Compare packed double-precision (64-bit) floating-point elements in a and b // for not-greater-than-or-equal, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_pd FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_u64(veorq_u64( vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), vdupq_n_u64(UINT64_MAX))); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); d[1] = !((*(double *) &a1) >= (*(double *) &b1)) ? 
~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b for not-greater-than-or-equal, store the result in the lower element of // dst, and copy the upper element from a to the upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_sd FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b) { return _mm_move_sd(a, _mm_cmpnge_pd(a, b)); } // Compare packed double-precision (64-bit) floating-point elements in a and b // for not-greater-than, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_cmpngt_pd FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_u64(veorq_u64( vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), vdupq_n_u64(UINT64_MAX))); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); d[1] = !((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b for not-greater-than, store the result in the lower element of dst, and // copy the upper element from a to the upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_sd FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b) { return _mm_move_sd(a, _mm_cmpngt_pd(a, b)); } // Compare packed double-precision (64-bit) floating-point elements in a and b // for not-less-than-or-equal, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_pd FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_u64(veorq_u64( vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), vdupq_n_u64(UINT64_MAX))); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); d[1] = !((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b for not-less-than-or-equal, store the result in the lower element of dst, // and copy the upper element from a to the upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_sd FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b) { return _mm_move_sd(a, _mm_cmpnle_pd(a, b)); } // Compare packed double-precision (64-bit) floating-point elements in a and b // for not-less-than, and store the results in dst. 
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_pd FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_u64(veorq_u64( vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), vdupq_n_u64(UINT64_MAX))); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); d[1] = !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b for not-less-than, store the result in the lower element of dst, and copy // the upper element from a to the upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_sd FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b) { return _mm_move_sd(a, _mm_cmpnlt_pd(a, b)); } // Compare packed double-precision (64-bit) floating-point elements in a and b // to see if neither is NaN, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_pd FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b) { #if defined(__aarch64__) // Excluding NaNs, any two floating point numbers can be compared. uint64x2_t not_nan_a = vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a)); uint64x2_t not_nan_b = vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b)); return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b)); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = ((*(double *) &a0) == (*(double *) &a0) && (*(double *) &b0) == (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); d[1] = ((*(double *) &a1) == (*(double *) &a1) && (*(double *) &b1) == (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b to see if neither is NaN, store the result in the lower element of dst, and // copy the upper element from a to the upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_sd FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b) { #if defined(__aarch64__) return _mm_move_sd(a, _mm_cmpord_pd(a, b)); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t d[2]; d[0] = ((*(double *) &a0) == (*(double *) &a0) && (*(double *) &b0) == (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare packed double-precision (64-bit) floating-point elements in a and b // to see if either is NaN, and store the results in dst. 
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_pd FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b) { #if defined(__aarch64__) // Two NaNs are not equal in comparison operation. uint64x2_t not_nan_a = vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a)); uint64x2_t not_nan_b = vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b)); return vreinterpretq_m128d_s32( vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b)))); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = ((*(double *) &a0) == (*(double *) &a0) && (*(double *) &b0) == (*(double *) &b0)) ? UINT64_C(0) : ~UINT64_C(0); d[1] = ((*(double *) &a1) == (*(double *) &a1) && (*(double *) &b1) == (*(double *) &b1)) ? UINT64_C(0) : ~UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b to see if either is NaN, store the result in the lower element of dst, and // copy the upper element from a to the upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_sd FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b) { #if defined(__aarch64__) return _mm_move_sd(a, _mm_cmpunord_pd(a, b)); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t d[2]; d[0] = ((*(double *) &a0) == (*(double *) &a0) && (*(double *) &b0) == (*(double *) &b0)) ? UINT64_C(0) : ~UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point element in a and b // for greater-than-or-equal, and return the boolean result (0 or 1). // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_sd FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b) { #if defined(__aarch64__) return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1; #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); return (*(double *) &a0 >= *(double *) &b0); #endif } // Compare the lower double-precision (64-bit) floating-point element in a and b // for greater-than, and return the boolean result (0 or 1). // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_sd FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b) { #if defined(__aarch64__) return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1; #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); return (*(double *) &a0 > *(double *) &b0); #endif } // Compare the lower double-precision (64-bit) floating-point element in a and b // for less-than-or-equal, and return the boolean result (0 or 1). 
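// Illustrative usage sketch (hypothetical helper, not part of the SSE API):
// lane-0 NaN test built from _mm_cmpunord_sd() above; a value compares
// "unordered" with itself exactly when it is NaN.
static inline int sse2neon_demo_lane0_isnan(__m128d a)
{
    __m128d unord = _mm_cmpunord_sd(a, a); // lower lane = ~0 iff a[0] is NaN
    return (int) (vgetq_lane_u64(vreinterpretq_u64_m128d(unord), 0) & 1);
}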
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_sd FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b) { #if defined(__aarch64__) return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1; #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); return (*(double *) &a0 <= *(double *) &b0); #endif } // Compare the lower double-precision (64-bit) floating-point element in a and b // for less-than, and return the boolean result (0 or 1). // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_sd FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b) { #if defined(__aarch64__) return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1; #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); return (*(double *) &a0 < *(double *) &b0); #endif } // Compare the lower double-precision (64-bit) floating-point element in a and b // for equality, and return the boolean result (0 or 1). // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_sd FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b) { #if defined(__aarch64__) return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1; #else uint32x4_t a_not_nan = vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a)); uint32x4_t b_not_nan = vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b)); uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); uint32x4_t a_eq_b = vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan), vreinterpretq_u64_u32(a_eq_b)); return vgetq_lane_u64(and_results, 0) & 0x1; #endif } // Compare the lower double-precision (64-bit) floating-point element in a and b // for not-equal, and return the boolean result (0 or 1). // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_sd FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b) { return !_mm_comieq_sd(a, b); } // Convert packed signed 32-bit integers in a to packed double-precision // (64-bit) floating-point elements, and store the results in dst. // // FOR j := 0 to 1 // i := j*32 // m := j*64 // dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_pd FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))))); #else double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0); double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1); return _mm_set_pd(a1, a0); #endif } // Converts the four signed 32-bit integer values of a to single-precision, // floating-point values // https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a) { return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a))); } // Convert packed double-precision (64-bit) floating-point elements in a to // packed 32-bit integers, and store the results in dst. 
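// Illustrative usage sketch (hypothetical helper, not part of the SSE API):
// broadcast a 32-bit integer, widen the low lane to double precision with
// _mm_cvtepi32_pd() above, and read it back; only NEON helpers already defined
// at the top of this file are used to build and inspect the vectors.
static inline double sse2neon_demo_int_to_double(int32_t x)
{
    __m128i vi = vreinterpretq_m128i_s32(vdupq_n_s32(x));
    __m128d vd = _mm_cvtepi32_pd(vi); // low two lanes become (double) x
#if defined(__aarch64__)
    return vgetq_lane_f64(vreinterpretq_f64_m128d(vd), 0);
#else
    return ((double *) &vd)[0]; // same trick the armv7 code paths above use
#endif
}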
// // FOR j := 0 to 1 // i := 32*j // k := 64*j // dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_epi32 FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a) { __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); double d0 = ((double *) &rnd)[0]; double d1 = ((double *) &rnd)[1]; return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0); } // Convert packed double-precision (64-bit) floating-point elements in a to // packed 32-bit integers, and store the results in dst. // // FOR j := 0 to 1 // i := 32*j // k := 64*j // dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_pi32 FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a) { __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); double d0 = ((double *) &rnd)[0]; double d1 = ((double *) &rnd)[1]; int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1}; return vreinterpret_m64_s32(vld1_s32(data)); } // Convert packed double-precision (64-bit) floating-point elements in a to // packed single-precision (32-bit) floating-point elements, and store the // results in dst. // // FOR j := 0 to 1 // i := 32*j // k := 64*j // dst[i+31:i] := Convert_FP64_To_FP32(a[k+64:k]) // ENDFOR // dst[127:64] := 0 // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a) { #if defined(__aarch64__) float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a)); return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0))); #else float a0 = (float) ((double *) &a)[0]; float a1 = (float) ((double *) &a)[1]; return _mm_set_ps(0, 0, a1, a0); #endif } // Convert packed signed 32-bit integers in a to packed double-precision // (64-bit) floating-point elements, and store the results in dst. // // FOR j := 0 to 1 // i := j*32 // m := j*64 // dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_pd FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a)))); #else double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0); double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1); return _mm_set_pd(a1, a0); #endif } // Converts the four single-precision, floating-point values of a to signed // 32-bit integer values. // // r0 := (int) a0 // r1 := (int) a1 // r2 := (int) a2 // r3 := (int) a3 // // https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx // *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A // does not support! It is supported on ARMv8-A however. 
FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a) { #if defined(__aarch64__) switch (_MM_GET_ROUNDING_MODE()) { case _MM_ROUND_NEAREST: return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a)); case _MM_ROUND_DOWN: return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a)); case _MM_ROUND_UP: return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a)); default: // _MM_ROUND_TOWARD_ZERO return vreinterpretq_m128i_s32(vcvtq_s32_f32(a)); } #else float *f = (float *) &a; switch (_MM_GET_ROUNDING_MODE()) { case _MM_ROUND_NEAREST: { uint32x4_t signmask = vdupq_n_u32(0x80000000); float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a), vdupq_n_f32(0.5f)); /* +/- 0.5 */ int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32( vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/ int32x4_t r_trunc = vcvtq_s32_f32( vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */ int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32( vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ float32x4_t delta = vsubq_f32( vreinterpretq_f32_m128(a), vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */ return vreinterpretq_m128i_s32( vbslq_s32(is_delta_half, r_even, r_normal)); } case _MM_ROUND_DOWN: return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0])); case _MM_ROUND_UP: return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0])); default: // _MM_ROUND_TOWARD_ZERO return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1], (int32_t) f[0]); } #endif } // Convert packed single-precision (32-bit) floating-point elements in a to // packed double-precision (64-bit) floating-point elements, and store the // results in dst. // // FOR j := 0 to 1 // i := 64*j // k := 32*j // dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a)))); #else double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1); return _mm_set_pd(a1, a0); #endif } // Copy the lower double-precision (64-bit) floating-point element of a to dst. // // dst[63:0] := a[63:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64 FORCE_INLINE double _mm_cvtsd_f64(__m128d a) { #if defined(__aarch64__) return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0); #else return ((double *) &a)[0]; #endif } // Convert the lower double-precision (64-bit) floating-point element in a to a // 32-bit integer, and store the result in dst. // // dst[31:0] := Convert_FP64_To_Int32(a[63:0]) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si32 FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a) { #if defined(__aarch64__) return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0); #else __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); double ret = ((double *) &rnd)[0]; return (int32_t) ret; #endif } // Convert the lower double-precision (64-bit) floating-point element in a to a // 64-bit integer, and store the result in dst. 
// // dst[63:0] := Convert_FP64_To_Int64(a[63:0]) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64 FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a) { #if defined(__aarch64__) return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0); #else __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); double ret = ((double *) &rnd)[0]; return (int64_t) ret; #endif } // Convert the lower double-precision (64-bit) floating-point element in a to a // 64-bit integer, and store the result in dst. // // dst[63:0] := Convert_FP64_To_Int64(a[63:0]) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64x #define _mm_cvtsd_si64x _mm_cvtsd_si64 // Convert the lower double-precision (64-bit) floating-point element in b to a // single-precision (32-bit) floating-point element, store the result in the // lower element of dst, and copy the upper 3 packed elements from a to the // upper elements of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_ss FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128_f32(vsetq_lane_f32( vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0), vreinterpretq_f32_m128(a), 0)); #else return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0], vreinterpretq_f32_m128(a), 0)); #endif } // Copy the lower 32-bit integer in a to dst. // // dst[31:0] := a[31:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32 FORCE_INLINE int _mm_cvtsi128_si32(__m128i a) { return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0); } // Copy the lower 64-bit integer in a to dst. // // dst[63:0] := a[63:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64 FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a) { return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0); } // Copy the lower 64-bit integer in a to dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x #define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a) // Convert the signed 32-bit integer b to a double-precision (64-bit) // floating-point element, store the result in the lower element of dst, and // copy the upper element from a to the upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_sd FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0)); #else double bf = (double) b; return vreinterpretq_m128d_s64( vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0)); #endif } // Copy the lower 64-bit integer in a to dst. // // dst[63:0] := a[63:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x #define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a) // Moves 32-bit integer a to the least significant 32 bits of an __m128 object, // zero extending the upper bits. // // r0 := a // r1 := 0x0 // r2 := 0x0 // r3 := 0x0 // // https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx FORCE_INLINE __m128i _mm_cvtsi32_si128(int a) { return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0)); } // Convert the signed 64-bit integer b to a double-precision (64-bit) // floating-point element, store the result in the lower element of dst, and // copy the upper element from a to the upper element of dst. 
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_sd FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0)); #else double bf = (double) b; return vreinterpretq_m128d_s64( vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0)); #endif } // Moves 64-bit integer a to the least significant 64 bits of an __m128 object, // zero extending the upper bits. // // r0 := a // r1 := 0x0 FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a) { return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0)); } // Copy 64-bit integer a to the lower element of dst, and zero the upper // element. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_si128 #define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a) // Convert the signed 64-bit integer b to a double-precision (64-bit) // floating-point element, store the result in the lower element of dst, and // copy the upper element from a to the upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_sd #define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b) // Convert the lower single-precision (32-bit) floating-point element in b to a // double-precision (64-bit) floating-point element, store the result in the // lower element of dst, and copy the upper element from a to the upper element // of dst. // // dst[63:0] := Convert_FP32_To_FP64(b[31:0]) // dst[127:64] := a[127:64] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sd FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b) { double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); #if defined(__aarch64__) return vreinterpretq_m128d_f64( vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0)); #else return vreinterpretq_m128d_s64( vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0)); #endif } // Convert packed double-precision (64-bit) floating-point elements in a to // packed 32-bit integers with truncation, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_epi32 FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a) { double a0 = ((double *) &a)[0]; double a1 = ((double *) &a)[1]; return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0); } // Convert packed double-precision (64-bit) floating-point elements in a to // packed 32-bit integers with truncation, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_pi32 FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a) { double a0 = ((double *) &a)[0]; double a1 = ((double *) &a)[1]; int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1}; return vreinterpret_m64_s32(vld1_s32(data)); } // Converts the four single-precision, floating-point values of a to signed // 32-bit integer values using truncate. // https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a) { return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))); } // Convert the lower double-precision (64-bit) floating-point element in a to a // 32-bit integer with truncation, and store the result in dst. 
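// Illustrative note (added): the _mm_cvtt* variants truncate toward zero,
// while _mm_cvtsd_si32/_mm_cvtsd_si64 honour the current rounding mode.
// Under the default round-to-nearest-even mode, for x = {2.5, 0.0}:
//   _mm_cvtsd_si32(x)  == 2    (2.5 rounds to the even integer 2)
//   _mm_cvttsd_si32(x) == 2
// and for x = {3.7, 0.0}:
//   _mm_cvtsd_si32(x)  == 4,   _mm_cvttsd_si32(x) == 3.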
// // dst[63:0] := Convert_FP64_To_Int32_Truncate(a[63:0]) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si32 FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a) { double ret = *((double *) &a); return (int32_t) ret; } // Convert the lower double-precision (64-bit) floating-point element in a to a // 64-bit integer with truncation, and store the result in dst. // // dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64 FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a) { #if defined(__aarch64__) return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0); #else double ret = *((double *) &a); return (int64_t) ret; #endif } // Convert the lower double-precision (64-bit) floating-point element in a to a // 64-bit integer with truncation, and store the result in dst. // // dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x #define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a) // Divide packed double-precision (64-bit) floating-point elements in a by // packed elements in b, and store the results in dst. // // FOR j := 0 to 1 // i := 64*j // dst[i+63:i] := a[i+63:i] / b[i+63:i] // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_pd FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else double *da = (double *) &a; double *db = (double *) &b; double c[2]; c[0] = da[0] / db[0]; c[1] = da[1] / db[1]; return vld1q_f32((float32_t *) c); #endif } // Divide the lower double-precision (64-bit) floating-point element in a by the // lower double-precision (64-bit) floating-point element in b, store the result // in the lower element of dst, and copy the upper element from a to the upper // element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sd FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b) { #if defined(__aarch64__) float64x2_t tmp = vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)); return vreinterpretq_m128d_f64( vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1)); #else return _mm_move_sd(a, _mm_div_pd(a, b)); #endif } // Extracts the selected signed or unsigned 16-bit integer from a and zero // extends. // https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx // FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm) #define _mm_extract_epi16(a, imm) \ vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm)) // Inserts the least significant 16 bits of b into the selected 16-bit integer // of a. // https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx // FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b, // __constrange(0,8) int imm) #define _mm_insert_epi16(a, b, imm) \ __extension__({ \ vreinterpretq_m128i_s16( \ vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \ }) // Loads two double-precision from 16-byte aligned memory, floating-point // values. 
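// Porting note (added sketch): vld1q imposes no 16-byte alignment requirement,
// so in this translation the "aligned" load behaves like an unaligned one
// (_mm_loadu_pd further below simply forwards here).  Typical use:
//   double buf[2] = {1.0, 2.0};
//   __m128d v = _mm_load_pd(buf);   // v = {1.0, 2.0}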
// // dst[127:0] := MEM[mem_addr+127:mem_addr] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd FORCE_INLINE __m128d _mm_load_pd(const double *p) { #if defined(__aarch64__) return vreinterpretq_m128d_f64(vld1q_f64(p)); #else const float *fp = (const float *) p; float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]}; return vreinterpretq_m128d_f32(vld1q_f32(data)); #endif } // Load a double-precision (64-bit) floating-point element from memory into both // elements of dst. // // dst[63:0] := MEM[mem_addr+63:mem_addr] // dst[127:64] := MEM[mem_addr+63:mem_addr] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1 #define _mm_load_pd1 _mm_load1_pd // Load a double-precision (64-bit) floating-point element from memory into the // lower of dst, and zero the upper element. mem_addr does not need to be // aligned on any particular boundary. // // dst[63:0] := MEM[mem_addr+63:mem_addr] // dst[127:64] := 0 // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd FORCE_INLINE __m128d _mm_load_sd(const double *p) { #if defined(__aarch64__) return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0)); #else const float *fp = (const float *) p; float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0}; return vreinterpretq_m128d_f32(vld1q_f32(data)); #endif } // Loads 128-bit value. : // https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx FORCE_INLINE __m128i _mm_load_si128(const __m128i *p) { return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); } // Load a double-precision (64-bit) floating-point element from memory into both // elements of dst. // // dst[63:0] := MEM[mem_addr+63:mem_addr] // dst[127:64] := MEM[mem_addr+63:mem_addr] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd FORCE_INLINE __m128d _mm_load1_pd(const double *p) { #if defined(__aarch64__) return vreinterpretq_m128d_f64(vld1q_dup_f64(p)); #else return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p)); #endif } // Load a double-precision (64-bit) floating-point element from memory into the // upper element of dst, and copy the lower element from a to dst. mem_addr does // not need to be aligned on any particular boundary. // // dst[63:0] := a[63:0] // dst[127:64] := MEM[mem_addr+63:mem_addr] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p))); #else return vreinterpretq_m128d_f32(vcombine_f32( vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p))); #endif } // Load 64-bit integer from memory into the first element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_epi64 FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p) { /* Load the lower 64 bits of the value pointed to by p into the * lower 64 bits of the result, zeroing the upper 64 bits of the result. */ return vreinterpretq_m128i_s32( vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0))); } // Load a double-precision (64-bit) floating-point element from memory into the // lower element of dst, and copy the upper element from a to dst. mem_addr does // not need to be aligned on any particular boundary. 
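// Illustrative sketch (added): _mm_load_sd and _mm_loadh_pd can be combined to
// gather two non-contiguous doubles into one register, e.g.
//   __m128d v = _mm_loadh_pd(_mm_load_sd(&lo), &hi);   // v = {lo, hi}
// where lo and hi are arbitrary double lvalues (placeholder names).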
// // dst[63:0] := MEM[mem_addr+63:mem_addr] // dst[127:64] := a[127:64] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a)))); #else return vreinterpretq_m128d_f32( vcombine_f32(vld1_f32((const float *) p), vget_high_f32(vreinterpretq_f32_m128d(a)))); #endif } // Load 2 double-precision (64-bit) floating-point elements from memory into dst // in reverse order. mem_addr must be aligned on a 16-byte boundary or a // general-protection exception may be generated. // // dst[63:0] := MEM[mem_addr+127:mem_addr+64] // dst[127:64] := MEM[mem_addr+63:mem_addr] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd FORCE_INLINE __m128d _mm_loadr_pd(const double *p) { #if defined(__aarch64__) float64x2_t v = vld1q_f64(p); return vreinterpretq_m128d_f64(vextq_f64(v, v, 1)); #else int64x2_t v = vld1q_s64((const int64_t *) p); return vreinterpretq_m128d_s64(vextq_s64(v, v, 1)); #endif } // Loads two double-precision from unaligned memory, floating-point values. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd FORCE_INLINE __m128d _mm_loadu_pd(const double *p) { return _mm_load_pd(p); } // Loads 128-bit value. : // https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p) { return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); } // Load unaligned 32-bit integer from memory into the first element of dst. // // dst[31:0] := MEM[mem_addr+31:mem_addr] // dst[MAX:32] := 0 // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32 FORCE_INLINE __m128i _mm_loadu_si32(const void *p) { return vreinterpretq_m128i_s32( vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0)); } // Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit // integers from b. // // r0 := (a0 * b0) + (a1 * b1) // r1 := (a2 * b2) + (a3 * b3) // r2 := (a4 * b4) + (a5 * b5) // r3 := (a6 * b6) + (a7 * b7) // https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b) { int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)), vget_low_s16(vreinterpretq_s16_m128i(b))); int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)), vget_high_s16(vreinterpretq_s16_m128i(b))); int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low)); int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high)); return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum)); } // Conditionally store 8-bit integer elements from a into memory using mask // (elements are not stored when the highest bit is not set in the corresponding // element) and a non-temporal memory hint. mem_addr does not need to be aligned // on any particular boundary. 
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmoveu_si128 FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr) { int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7); __m128 b = _mm_load_ps((const float *) mem_addr); int8x16_t masked = vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128(b)); vst1q_s8((int8_t *) mem_addr, masked); } // Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8 // signed 16-bit integers from b. // https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_s16( vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the // 16 unsigned 8-bit integers from b. // https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); } // Compare packed double-precision (64-bit) floating-point elements in a and b, // and store packed maximum values in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pd FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b) { #if defined(__aarch64__) #if SSE2NEON_PRECISE_MINMAX float64x2_t _a = vreinterpretq_f64_m128d(a); float64x2_t _b = vreinterpretq_f64_m128d(b); return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b)); #else return vreinterpretq_m128d_f64( vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #endif #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0; d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b, store the maximum value in the lower element of dst, and copy the upper // element from a to the upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_sd FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b) { #if defined(__aarch64__) return _mm_move_sd(a, _mm_max_pd(a, b)); #else double *da = (double *) &a; double *db = (double *) &b; double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]}; return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c)); #endif } // Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8 // signed 16-bit integers from b. // https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_s16( vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the // 16 unsigned 8-bit integers from b. 
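// Porting note (added, hedged): SSE max/min return the second operand when the
// comparison fails (e.g. when an input is NaN, or for -0.0 vs +0.0).  The
// SSE2NEON_PRECISE_MINMAX path above reproduces this with a compare-and-select,
// whereas plain vmaxq_f64/vminq_f64 propagate NaNs, so e.g.
//   _mm_max_pd(_mm_set_pd(NAN, 1.0), _mm_set_pd(2.0, 3.0))
// is {3.0, 2.0} on x86 but may contain NaN on the fast (imprecise) path.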
// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); } // Compare packed double-precision (64-bit) floating-point elements in a and b, // and store packed minimum values in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pd FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b) { #if defined(__aarch64__) #if SSE2NEON_PRECISE_MINMAX float64x2_t _a = vreinterpretq_f64_m128d(a); float64x2_t _b = vreinterpretq_f64_m128d(b); return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b)); #else return vreinterpretq_m128d_f64( vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #endif #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0; d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b, store the minimum value in the lower element of dst, and copy the upper // element from a to the upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_sd FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b) { #if defined(__aarch64__) return _mm_move_sd(a, _mm_min_pd(a, b)); #else double *da = (double *) &a; double *db = (double *) &b; double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]}; return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c)); #endif } // Copy the lower 64-bit integer in a to the lower element of dst, and zero the // upper element. // // dst[63:0] := a[63:0] // dst[127:64] := 0 // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64 FORCE_INLINE __m128i _mm_move_epi64(__m128i a) { return vreinterpretq_m128i_s64( vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1)); } // Move the lower double-precision (64-bit) floating-point element from b to the // lower element of dst, and copy the upper element from a to the upper element // of dst. // // dst[63:0] := b[63:0] // dst[127:64] := a[127:64] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sd FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b) { return vreinterpretq_m128d_f32( vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)), vget_high_f32(vreinterpretq_f32_m128d(a)))); } // NEON does not provide a version of this function. // Creates a 16-bit mask from the most significant bits of the 16 signed or // unsigned 8-bit integers in a and zero extends the upper bits. // https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx FORCE_INLINE int _mm_movemask_epi8(__m128i a) { // Use increasingly wide shifts+adds to collect the sign bits // together. // Since the widening shifts would be rather confusing to follow in little // endian, everything will be illustrated in big endian order instead. This // has a different result - the bits would actually be reversed on a big // endian machine. 
// Starting input (only half the elements are shown): // 89 ff 1d c0 00 10 99 33 uint8x16_t input = vreinterpretq_u8_m128i(a); // Shift out everything but the sign bits with an unsigned shift right. // // Bytes of the vector:: // 89 ff 1d c0 00 10 99 33 // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7) // | | | | | | | | // 01 01 00 01 00 00 01 00 // // Bits of first important lane(s): // 10001001 (89) // \______ // | // 00000001 (01) uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7)); // Merge the even lanes together with a 16-bit unsigned shift right + add. // 'xx' represents garbage data which will be ignored in the final result. // In the important bytes, the add functions like a binary OR. // // 01 01 00 01 00 00 01 00 // \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7)) // \| \| \| \| // xx 03 xx 01 xx 00 xx 02 // // 00000001 00000001 (01 01) // \_______ | // \| // xxxxxxxx xxxxxx11 (xx 03) uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); // Repeat with a wider 32-bit shift + add. // xx 03 xx 01 xx 00 xx 02 // \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >> // 14)) // \| \| // xx xx xx 0d xx xx xx 02 // // 00000011 00000001 (03 01) // \\_____ || // '----.\|| // xxxxxxxx xxxx1101 (xx 0d) uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); // Last, an even wider 64-bit shift + add to get our result in the low 8 bit // lanes. xx xx xx 0d xx xx xx 02 // \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >> // 28)) // \| // xx xx xx xx xx xx xx d2 // // 00001101 00000010 (0d 02) // \ \___ | | // '---. \| | // xxxxxxxx 11010010 (xx d2) uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts. // xx xx xx xx xx xx xx d2 // || return paired64[0] // d2 // Note: Little endian would return the correct value 4b (01001011) instead. return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8); } // Set each bit of mask dst based on the most significant bit of the // corresponding packed double-precision (64-bit) floating-point element in a. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pd FORCE_INLINE int _mm_movemask_pd(__m128d a) { uint64x2_t input = vreinterpretq_u64_m128d(a); uint64x2_t high_bits = vshrq_n_u64(input, 63); return (int) (vgetq_lane_u64(high_bits, 0)) | ((int) (vgetq_lane_u64(high_bits, 1) << 1)); } // Copy the lower 64-bit integer in a to dst. // // dst[63:0] := a[63:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64 FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a) { return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a))); } // Copy the 64-bit integer a to the lower element of dst, and zero the upper // element. // // dst[63:0] := a[63:0] // dst[127:64] := 0 // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64 FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a) { return vreinterpretq_m128i_s64( vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0))); } // Multiply the low unsigned 32-bit integers from each packed 64-bit element in // a and b, and store the unsigned 64-bit results in dst. // // r0 := (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF) // r1 := (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF) FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b) { // vmull_u32 upcasts instead of masking, so we downcast. 
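    // (added note) vmovn_u64 keeps the low 32 bits of each 64-bit lane, which
    // are exactly the a0/a2 and b0/b2 factors this intrinsic multiplies; the
    // widening vmull_u32 below then produces the two full 64-bit products.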
uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a)); uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b)); return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo)); } // Multiply packed double-precision (64-bit) floating-point elements in a and b, // and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else double *da = (double *) &a; double *db = (double *) &b; double c[2]; c[0] = da[0] * db[0]; c[1] = da[1] * db[1]; return vld1q_f32((float32_t *) c); #endif } // Multiply the lower double-precision (64-bit) floating-point element in a and // b, store the result in the lower element of dst, and copy the upper element // from a to the upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_sd FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b) { return _mm_move_sd(a, _mm_mul_pd(a, b)); } // Multiply the low unsigned 32-bit integers from a and b, and store the // unsigned 64-bit result in dst. // // dst[63:0] := a[31:0] * b[31:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32 FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b) { return vreinterpret_m64_u64(vget_low_u64( vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b)))); } // Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit // integers from b. // // r0 := (a0 * b0)[31:16] // r1 := (a1 * b1)[31:16] // ... // r7 := (a7 * b7)[31:16] // // https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b) { /* FIXME: issue with large values because of result saturation */ // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a), // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1)); int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a)); int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b)); int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */ int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a)); int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b)); int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */ uint16x8x2_t r = vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654)); return vreinterpretq_m128i_u16(r.val[1]); } // Multiply the packed unsigned 16-bit integers in a and b, producing // intermediate 32-bit integers, and store the high 16 bits of the intermediate // integers in dst. 
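// Illustrative example (added): only the upper halves of the 32-bit products
// are kept, e.g. for lane values 0xFFFF * 0xFFFF = 0xFFFE0001 the result lane
// is 0xFFFE, and 300 * 400 = 120000 = 0x0001D4C0 yields 0x0001.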
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epu16 FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b) { uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a)); uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b)); uint32x4_t ab3210 = vmull_u16(a3210, b3210); #if defined(__aarch64__) uint32x4_t ab7654 = vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)); uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654)); return vreinterpretq_m128i_u16(r); #else uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a)); uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b)); uint32x4_t ab7654 = vmull_u16(a7654, b7654); uint16x8x2_t r = vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654)); return vreinterpretq_m128i_u16(r.val[1]); #endif } // Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or // unsigned 16-bit integers from b. // // r0 := (a0 * b0)[15:0] // r1 := (a1 * b1)[15:0] // ... // r7 := (a7 * b7)[15:0] // // https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_s16( vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Compute the bitwise OR of packed double-precision (64-bit) floating-point // elements in a and b, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_or_pd FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b) { return vreinterpretq_m128d_s64( vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); } // Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b. // // r := a | b // // https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Packs the 16 signed 16-bit integers from a and b into 8-bit integers and // saturates. // https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_s8( vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)), vqmovn_s16(vreinterpretq_s16_m128i(b)))); } // Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers // and saturates. // // r0 := SignedSaturate(a0) // r1 := SignedSaturate(a1) // r2 := SignedSaturate(a2) // r3 := SignedSaturate(a3) // r4 := SignedSaturate(b0) // r5 := SignedSaturate(b1) // r6 := SignedSaturate(b2) // r7 := SignedSaturate(b3) // // https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_s16( vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)), vqmovn_s32(vreinterpretq_s32_m128i(b)))); } // Packs the 16 signed 16 - bit integers from a and b into 8 - bit unsigned // integers and saturates. // // r0 := UnsignedSaturate(a0) // r1 := UnsignedSaturate(a1) // ... // r7 := UnsignedSaturate(a7) // r8 := UnsignedSaturate(b0) // r9 := UnsignedSaturate(b1) // ... // r15 := UnsignedSaturate(b7) // // https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b) { return vreinterpretq_m128i_u8( vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)), vqmovun_s16(vreinterpretq_s16_m128i(b)))); } // Pause the processor. 
This is typically used in spin-wait loops and depending // on the x86 processor typical values are in the 40-100 cycle range. The // 'yield' instruction isn't a good fit because it's effectively a nop on most // Arm cores. Experience with several databases has shown has shown an 'isb' is // a reasonable approximation. FORCE_INLINE void _mm_pause() { __asm__ __volatile__("isb\n"); } // Compute the absolute differences of packed unsigned 8-bit integers in a and // b, then horizontally sum each consecutive 8 differences to produce two // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low // 16 bits of 64-bit elements in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8 FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b) { uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b)); return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t))); } // Sets the 8 signed 16-bit integer values. // https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx FORCE_INLINE __m128i _mm_set_epi16(short i7, short i6, short i5, short i4, short i3, short i2, short i1, short i0) { int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; return vreinterpretq_m128i_s16(vld1q_s16(data)); } // Sets the 4 signed 32-bit integer values. // https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0) { int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3}; return vreinterpretq_m128i_s32(vld1q_s32(data)); } // Returns the __m128i structure with its two 64-bit integer values // initialized to the values of the two 64-bit integers passed in. // https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2) { return _mm_set_epi64x((int64_t) i1, (int64_t) i2); } // Returns the __m128i structure with its two 64-bit integer values // initialized to the values of the two 64-bit integers passed in. // https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2) { return vreinterpretq_m128i_s64( vcombine_s64(vcreate_s64(i2), vcreate_s64(i1))); } // Sets the 16 signed 8-bit integer values. // https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx FORCE_INLINE __m128i _mm_set_epi8(signed char b15, signed char b14, signed char b13, signed char b12, signed char b11, signed char b10, signed char b9, signed char b8, signed char b7, signed char b6, signed char b5, signed char b4, signed char b3, signed char b2, signed char b1, signed char b0) { int8_t ALIGN_STRUCT(16) data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; return (__m128i) vld1q_s8(data); } // Set packed double-precision (64-bit) floating-point elements in dst with the // supplied values. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd FORCE_INLINE __m128d _mm_set_pd(double e1, double e0) { double ALIGN_STRUCT(16) data[2] = {e0, e1}; #if defined(__aarch64__) return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data)); #else return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data)); #endif } // Broadcast double-precision (64-bit) floating-point value a to all elements of // dst. 
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd1 #define _mm_set_pd1 _mm_set1_pd // Copy double-precision (64-bit) floating-point element a to the lower element // of dst, and zero the upper element. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sd FORCE_INLINE __m128d _mm_set_sd(double a) { return _mm_set_pd(0, a); } // Sets the 8 signed 16-bit integer values to w. // // r0 := w // r1 := w // ... // r7 := w // // https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx FORCE_INLINE __m128i _mm_set1_epi16(short w) { return vreinterpretq_m128i_s16(vdupq_n_s16(w)); } // Sets the 4 signed 32-bit integer values to i. // // r0 := i // r1 := i // r2 := i // r3 := I // // https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx FORCE_INLINE __m128i _mm_set1_epi32(int _i) { return vreinterpretq_m128i_s32(vdupq_n_s32(_i)); } // Sets the 2 signed 64-bit integer values to i. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100) FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i) { return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i)); } // Sets the 2 signed 64-bit integer values to i. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i) { return vreinterpretq_m128i_s64(vdupq_n_s64(_i)); } // Sets the 16 signed 8-bit integer values to b. // // r0 := b // r1 := b // ... // r15 := b // // https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx FORCE_INLINE __m128i _mm_set1_epi8(signed char w) { return vreinterpretq_m128i_s8(vdupq_n_s8(w)); } // Broadcast double-precision (64-bit) floating-point value a to all elements of // dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pd FORCE_INLINE __m128d _mm_set1_pd(double d) { #if defined(__aarch64__) return vreinterpretq_m128d_f64(vdupq_n_f64(d)); #else return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d)); #endif } // Sets the 8 signed 16-bit integer values in reverse order. // // Return Value // r0 := w0 // r1 := w1 // ... // r7 := w7 FORCE_INLINE __m128i _mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7) { int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7}; return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data)); } // Sets the 4 signed 32-bit integer values in reverse order // https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0) { int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0}; return vreinterpretq_m128i_s32(vld1q_s32(data)); } // Set packed 64-bit integers in dst with the supplied values in reverse order. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64 FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0) { return vreinterpretq_m128i_s64(vcombine_s64(e1, e0)); } // Sets the 16 signed 8-bit integer values in reverse order. 
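// Illustrative note (added): the _mm_setr_* variants take their arguments in
// memory (lane) order, so the following two calls build the same vector:
//   _mm_set_epi32(3, 2, 1, 0)    // lane 0 = 0, ..., lane 3 = 3
//   _mm_setr_epi32(0, 1, 2, 3)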
// https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx FORCE_INLINE __m128i _mm_setr_epi8(signed char b0, signed char b1, signed char b2, signed char b3, signed char b4, signed char b5, signed char b6, signed char b7, signed char b8, signed char b9, signed char b10, signed char b11, signed char b12, signed char b13, signed char b14, signed char b15) { int8_t ALIGN_STRUCT(16) data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; return (__m128i) vld1q_s8(data); } // Set packed double-precision (64-bit) floating-point elements in dst with the // supplied values in reverse order. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_pd FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0) { return _mm_set_pd(e0, e1); } // Return vector of type __m128d with all elements set to zero. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd FORCE_INLINE __m128d _mm_setzero_pd(void) { #if defined(__aarch64__) return vreinterpretq_m128d_f64(vdupq_n_f64(0)); #else return vreinterpretq_m128d_f32(vdupq_n_f32(0)); #endif } // Sets the 128-bit value to zero // https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx FORCE_INLINE __m128i _mm_setzero_si128(void) { return vreinterpretq_m128i_s32(vdupq_n_s32(0)); } // Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm. // https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx // FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a, // __constrange(0,255) int imm) #if __has_builtin(__builtin_shufflevector) #define _mm_shuffle_epi32(a, imm) \ __extension__({ \ int32x4_t _input = vreinterpretq_s32_m128i(a); \ int32x4_t _shuf = __builtin_shufflevector( \ _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \ ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \ vreinterpretq_m128i_s32(_shuf); \ }) #else // generic #define _mm_shuffle_epi32(a, imm) \ __extension__({ \ __m128i ret; \ switch (imm) { \ case _MM_SHUFFLE(1, 0, 3, 2): \ ret = _mm_shuffle_epi_1032((a)); \ break; \ case _MM_SHUFFLE(2, 3, 0, 1): \ ret = _mm_shuffle_epi_2301((a)); \ break; \ case _MM_SHUFFLE(0, 3, 2, 1): \ ret = _mm_shuffle_epi_0321((a)); \ break; \ case _MM_SHUFFLE(2, 1, 0, 3): \ ret = _mm_shuffle_epi_2103((a)); \ break; \ case _MM_SHUFFLE(1, 0, 1, 0): \ ret = _mm_shuffle_epi_1010((a)); \ break; \ case _MM_SHUFFLE(1, 0, 0, 1): \ ret = _mm_shuffle_epi_1001((a)); \ break; \ case _MM_SHUFFLE(0, 1, 0, 1): \ ret = _mm_shuffle_epi_0101((a)); \ break; \ case _MM_SHUFFLE(2, 2, 1, 1): \ ret = _mm_shuffle_epi_2211((a)); \ break; \ case _MM_SHUFFLE(0, 1, 2, 2): \ ret = _mm_shuffle_epi_0122((a)); \ break; \ case _MM_SHUFFLE(3, 3, 3, 2): \ ret = _mm_shuffle_epi_3332((a)); \ break; \ case _MM_SHUFFLE(0, 0, 0, 0): \ ret = _mm_shuffle_epi32_splat((a), 0); \ break; \ case _MM_SHUFFLE(1, 1, 1, 1): \ ret = _mm_shuffle_epi32_splat((a), 1); \ break; \ case _MM_SHUFFLE(2, 2, 2, 2): \ ret = _mm_shuffle_epi32_splat((a), 2); \ break; \ case _MM_SHUFFLE(3, 3, 3, 3): \ ret = _mm_shuffle_epi32_splat((a), 3); \ break; \ default: \ ret = _mm_shuffle_epi32_default((a), (imm)); \ break; \ } \ ret; \ }) #endif // Shuffle double-precision (64-bit) floating-point elements using the control // in imm8, and store the results in dst. // // dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] // dst[127:64] := (imm8[1] == 0) ? 
b[63:0] : b[127:64] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pd #if __has_builtin(__builtin_shufflevector) #define _mm_shuffle_pd(a, b, imm8) \ vreinterpretq_m128d_s64(__builtin_shufflevector( \ vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), imm8 & 0x1, \ ((imm8 & 0x2) >> 1) + 2)) #else #define _mm_shuffle_pd(a, b, imm8) \ _mm_castsi128_pd(_mm_set_epi64x( \ vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \ vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1))) #endif // FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a, // __constrange(0,255) int imm) #if __has_builtin(__builtin_shufflevector) #define _mm_shufflehi_epi16(a, imm) \ __extension__({ \ int16x8_t _input = vreinterpretq_s16_m128i(a); \ int16x8_t _shuf = __builtin_shufflevector( \ _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \ (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \ (((imm) >> 6) & 0x3) + 4); \ vreinterpretq_m128i_s16(_shuf); \ }) #else // generic #define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm)) #endif // FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a, // __constrange(0,255) int imm) #if __has_builtin(__builtin_shufflevector) #define _mm_shufflelo_epi16(a, imm) \ __extension__({ \ int16x8_t _input = vreinterpretq_s16_m128i(a); \ int16x8_t _shuf = __builtin_shufflevector( \ _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \ (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \ vreinterpretq_m128i_s16(_shuf); \ }) #else // generic #define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm)) #endif // Shift packed 16-bit integers in a left by count while shifting in zeros, and // store the results in dst. // // FOR j := 0 to 7 // i := j*16 // IF count[63:0] > 15 // dst[i+15:i] := 0 // ELSE // dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi16 FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count) { uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); if (_sse2neon_unlikely(c & ~15)) return _mm_setzero_si128(); int16x8_t vc = vdupq_n_s16((int16_t) c); return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc)); } // Shift packed 32-bit integers in a left by count while shifting in zeros, and // store the results in dst. // // FOR j := 0 to 3 // i := j*32 // IF count[63:0] > 31 // dst[i+31:i] := 0 // ELSE // dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi32 FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count) { uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); if (_sse2neon_unlikely(c & ~31)) return _mm_setzero_si128(); int32x4_t vc = vdupq_n_s32((int32_t) c); return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc)); } // Shift packed 64-bit integers in a left by count while shifting in zeros, and // store the results in dst. 
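// Usage note (added sketch): the _mm_sll_* forms read the shift count from the
// low 64 bits of a vector and, like the immediate _mm_slli_* forms below,
// return all zeros once the count reaches the element width, e.g.
//   _mm_slli_epi16(x, 3)    // multiplies every 16-bit lane of x by 8
//   _mm_slli_epi16(x, 16)   // all lanes become 0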
// // FOR j := 0 to 1 // i := j*64 // IF count[63:0] > 63 // dst[i+63:i] := 0 // ELSE // dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi64 FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count) { uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); if (_sse2neon_unlikely(c & ~63)) return _mm_setzero_si128(); int64x2_t vc = vdupq_n_s64((int64_t) c); return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc)); } // Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and // store the results in dst. // // FOR j := 0 to 7 // i := j*16 // IF imm8[7:0] > 15 // dst[i+15:i] := 0 // ELSE // dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi16 FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm) { if (_sse2neon_unlikely(imm & ~15)) return _mm_setzero_si128(); return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16((int16_t) imm))); } // Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and // store the results in dst. // // FOR j := 0 to 3 // i := j*32 // IF imm8[7:0] > 31 // dst[i+31:i] := 0 // ELSE // dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi32 FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm) { if (_sse2neon_unlikely(imm & ~31)) return _mm_setzero_si128(); return vreinterpretq_m128i_s32( vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm))); } // Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and // store the results in dst. // // FOR j := 0 to 1 // i := j*64 // IF imm8[7:0] > 63 // dst[i+63:i] := 0 // ELSE // dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi64 FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm) { if (_sse2neon_unlikely(imm & ~63)) return _mm_setzero_si128(); return vreinterpretq_m128i_s64( vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm))); } // Shift a left by imm8 bytes while shifting in zeros, and store the results in // dst. // // tmp := imm8[7:0] // IF tmp > 15 // tmp := 16 // FI // dst[127:0] := a[127:0] << (tmp*8) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_si128 FORCE_INLINE __m128i _mm_slli_si128(__m128i a, int imm) { if (_sse2neon_unlikely(imm & ~15)) return _mm_setzero_si128(); uint8x16_t tmp[2] = {vdupq_n_u8(0), vreinterpretq_u8_m128i(a)}; return vreinterpretq_m128i_u8( vld1q_u8(((uint8_t const *) tmp) + (16 - imm))); } // Compute the square root of packed double-precision (64-bit) floating-point // elements in a, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_pd FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a) { #if defined(__aarch64__) return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a))); #else double a0 = SQRT(((double *) &a)[0]); double a1 = SQRT(((double *) &a)[1]); return _mm_set_pd(a1, a0); #endif } // Compute the square root of the lower double-precision (64-bit) floating-point // element in b, store the result in the lower element of dst, and copy the // upper element from a to the upper element of dst. 
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_sd FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b) { #if defined(__aarch64__) return _mm_move_sd(a, _mm_sqrt_pd(b)); #else return _mm_set_pd(((double *) &a)[1], SQRT(((double *) &b)[0])); #endif } // Shift packed 16-bit integers in a right by count while shifting in sign bits, // and store the results in dst. // // FOR j := 0 to 7 // i := j*16 // IF count[63:0] > 15 // dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) // ELSE // dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sra_epi16 FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count) { int64_t c = (int64_t) vget_low_s64((int64x2_t) count); if (_sse2neon_unlikely(c & ~15)) return _mm_cmplt_epi16(a, _mm_setzero_si128()); return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16((int16_t) (-c)))); } // Shift packed 32-bit integers in a right by count while shifting in sign bits, // and store the results in dst. // // FOR j := 0 to 3 // i := j*32 // IF count[63:0] > 31 // dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) // ELSE // dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sra_epi32 FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count) { int64_t c = (int64_t) vget_low_s64((int64x2_t) count); if (_sse2neon_unlikely(c & ~31)) return _mm_cmplt_epi32(a, _mm_setzero_si128()); return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32((int16_t)(-c)))); } // Shift packed 16-bit integers in a right by imm8 while shifting in sign // bits, and store the results in dst. // // FOR j := 0 to 7 // i := j*16 // IF imm8[7:0] > 15 // dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) // ELSE // dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16 FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm) { const int count = (imm & ~15) ? 15 : imm; return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16((int16_t)(-count))); } // Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, // and store the results in dst. // // FOR j := 0 to 3 // i := j*32 // IF imm8[7:0] > 31 // dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) // ELSE // dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32 // FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm) #define _mm_srai_epi32(a, imm) \ __extension__({ \ __m128i ret; \ if (_sse2neon_unlikely((imm) == 0)) { \ ret = a; \ } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) { \ ret = vreinterpretq_m128i_s32( \ vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-imm))); \ } else { \ ret = vreinterpretq_m128i_s32( \ vshrq_n_s32(vreinterpretq_s32_m128i(a), 31)); \ } \ ret; \ }) // Shift packed 16-bit integers in a right by count while shifting in zeros, and // store the results in dst. 
// // FOR j := 0 to 7 // i := j*16 // IF count[63:0] > 15 // dst[i+15:i] := 0 // ELSE // dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi16 FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count) { uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); if (_sse2neon_unlikely(c & ~15)) return _mm_setzero_si128(); int16x8_t vc = vdupq_n_s16(-(int16_t) c); return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc)); } // Shift packed 32-bit integers in a right by count while shifting in zeros, and // store the results in dst. // // FOR j := 0 to 3 // i := j*32 // IF count[63:0] > 31 // dst[i+31:i] := 0 // ELSE // dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi32 FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count) { uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); if (_sse2neon_unlikely(c & ~31)) return _mm_setzero_si128(); int32x4_t vc = vdupq_n_s32(-(int32_t) c); return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc)); } // Shift packed 64-bit integers in a right by count while shifting in zeros, and // store the results in dst. // // FOR j := 0 to 1 // i := j*64 // IF count[63:0] > 63 // dst[i+63:i] := 0 // ELSE // dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi64 FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count) { uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); if (_sse2neon_unlikely(c & ~63)) return _mm_setzero_si128(); int64x2_t vc = vdupq_n_s64(-(int64_t) c); return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc)); } // Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and // store the results in dst. // // FOR j := 0 to 7 // i := j*16 // IF imm8[7:0] > 15 // dst[i+15:i] := 0 // ELSE // dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16 #define _mm_srli_epi16(a, imm) \ __extension__({ \ __m128i ret; \ if (_sse2neon_unlikely((imm) & ~15)) { \ ret = _mm_setzero_si128(); \ } else { \ ret = vreinterpretq_m128i_u16( \ vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-(imm)))); \ } \ ret; \ }) // Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and // store the results in dst. // // FOR j := 0 to 3 // i := j*32 // IF imm8[7:0] > 31 // dst[i+31:i] := 0 // ELSE // dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32 // FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm) #define _mm_srli_epi32(a, imm) \ __extension__({ \ __m128i ret; \ if (_sse2neon_unlikely((imm) & ~31)) { \ ret = _mm_setzero_si128(); \ } else { \ ret = vreinterpretq_m128i_u32( \ vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-(imm)))); \ } \ ret; \ }) // Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and // store the results in dst. 
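// Usage note (added sketch): _mm_srli_* shifts in zeros while _mm_srai_* shifts
// in copies of the sign bit, e.g. for a 32-bit lane holding -8 (0xFFFFFFF8):
//   _mm_srai_epi32(x, 1)   // lane becomes -4          (0xFFFFFFFC)
//   _mm_srli_epi32(x, 1)   // lane becomes 0x7FFFFFFC  (2147483644)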
// // FOR j := 0 to 1 // i := j*64 // IF imm8[7:0] > 63 // dst[i+63:i] := 0 // ELSE // dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64 #define _mm_srli_epi64(a, imm) \ __extension__({ \ __m128i ret; \ if (_sse2neon_unlikely((imm) & ~63)) { \ ret = _mm_setzero_si128(); \ } else { \ ret = vreinterpretq_m128i_u64( \ vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-(imm)))); \ } \ ret; \ }) // Shift a right by imm8 bytes while shifting in zeros, and store the results in // dst. // // tmp := imm8[7:0] // IF tmp > 15 // tmp := 16 // FI // dst[127:0] := a[127:0] >> (tmp*8) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_si128 FORCE_INLINE __m128i _mm_srli_si128(__m128i a, int imm) { if (_sse2neon_unlikely(imm & ~15)) return _mm_setzero_si128(); uint8x16_t tmp[2] = {vreinterpretq_u8_m128i(a), vdupq_n_u8(0)}; return vreinterpretq_m128i_u8(vld1q_u8(((uint8_t const *) tmp) + imm)); } // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point // elements) from a into memory. mem_addr must be aligned on a 16-byte boundary // or a general-protection exception may be generated. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a) { #if defined(__aarch64__) vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a)); #else vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a)); #endif } // Store the lower double-precision (64-bit) floating-point element from a into // 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte // boundary or a general-protection exception may be generated. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd1 FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a) { #if defined(__aarch64__) float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a)); vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low))); #else float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a)); vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low))); #endif } // Store the lower double-precision (64-bit) floating-point element from a into // memory. mem_addr does not need to be aligned on any particular boundary. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_store_sd FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a) { #if defined(__aarch64__) vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a))); #else vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a))); #endif } // Stores four 32-bit integer values as (as a __m128i value) at the address p. // https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a) { vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); } // Store the lower double-precision (64-bit) floating-point element from a into // 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte // boundary or a general-protection exception may be generated. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=9,526,5601&text=_mm_store1_pd #define _mm_store1_pd _mm_store_pd1 // Store the upper double-precision (64-bit) floating-point element from a into // memory. 
// // MEM[mem_addr+63:mem_addr] := a[127:64] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeh_pd FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a) { #if defined(__aarch64__) vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a))); #else vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a))); #endif } // Reads the lower 64 bits of b and stores them into the lower 64 bits of a. // https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b) { uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a)); uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b)); *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi)); } // Store the lower double-precision (64-bit) floating-point element from a into // memory. // // MEM[mem_addr+63:mem_addr] := a[63:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_pd FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a) { #if defined(__aarch64__) vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a))); #else vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a))); #endif } // Store 2 double-precision (64-bit) floating-point elements from a into memory // in reverse order. mem_addr must be aligned on a 16-byte boundary or a // general-protection exception may be generated. // // MEM[mem_addr+63:mem_addr] := a[127:64] // MEM[mem_addr+127:mem_addr+64] := a[63:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_pd FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a) { float32x4_t f = vreinterpretq_f32_m128d(a); _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2))); } // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point // elements) from a into memory. mem_addr does not need to be aligned on any // particular boundary. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a) { _mm_store_pd(mem_addr, a); } // Stores 128-bits of integer data a at the address p. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si128 FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a) { vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); } // Stores 32-bits of integer data a at the address p. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si32 FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a) { vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0); } // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point // elements) from a into memory using a non-temporal memory hint. mem_addr must // be aligned on a 16-byte boundary or a general-protection exception may be // generated. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pd FORCE_INLINE void _mm_stream_pd(double *p, __m128d a) { #if __has_builtin(__builtin_nontemporal_store) __builtin_nontemporal_store(a, (float32x4_t *) p); #elif defined(__aarch64__) vst1q_f64(p, vreinterpretq_f64_m128d(a)); #else vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a)); #endif } // Stores the data in a to the address p without polluting the caches. If the // cache line containing address p is already in the cache, the cache will be // updated. 
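// Note on the non-temporal stores in this header: there is no direct NEON
// intrinsic for a streaming store, so when the compiler does not provide
// __builtin_nontemporal_store the implementations fall back to ordinary vector
// stores. The result is still correct; only the cache-bypass hint is lost.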
// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a) { #if __has_builtin(__builtin_nontemporal_store) __builtin_nontemporal_store(a, p); #else vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a)); #endif } // Store 32-bit integer a into memory using a non-temporal hint to minimize // cache pollution. If the cache line containing address mem_addr is already in // the cache, the cache will be updated. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si32 FORCE_INLINE void _mm_stream_si32(int *p, int a) { vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0); } // Store 64-bit integer a into memory using a non-temporal hint to minimize // cache pollution. If the cache line containing address mem_addr is already in // the cache, the cache will be updated. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si64 FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a) { vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a)); } // Subtract packed 16-bit integers in b from packed 16-bit integers in a, and // store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi16 FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_s16( vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or // unsigned 32-bit integers of a. // // r0 := a0 - b0 // r1 := a1 - b1 // r2 := a2 - b2 // r3 := a3 - b3 // // https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a, // and store the results in dst. // r0 := a0 - b0 // r1 := a1 - b1 FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b) { return vreinterpretq_m128i_s64( vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); } // Subtract packed 8-bit integers in b from packed 8-bit integers in a, and // store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi8 FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_s8( vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } // Subtract packed double-precision (64-bit) floating-point elements in b from // packed double-precision (64-bit) floating-point elements in a, and store the // results in dst. // // FOR j := 0 to 1 // i := j*64 // dst[i+63:i] := a[i+63:i] - b[i+63:i] // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_pd FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else double *da = (double *) &a; double *db = (double *) &b; double c[2]; c[0] = da[0] - db[0]; c[1] = da[1] - db[1]; return vld1q_f32((float32_t *) c); #endif } // Subtract the lower double-precision (64-bit) floating-point element in b from // the lower double-precision (64-bit) floating-point element in a, store the // result in the lower element of dst, and copy the upper element from a to the // upper element of dst. 
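// Worked example (illustrative values, low element listed first): with
// a = {1.5, 8.0} and b = {0.5, 100.0}, _mm_sub_sd(a, b) returns {1.0, 8.0}:
// only the low lane is subtracted, the high lane is copied from a.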
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sd FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b) { return _mm_move_sd(a, _mm_sub_pd(a, b)); } // Subtract 64-bit integer b from 64-bit integer a, and store the result in dst. // // dst[63:0] := a[63:0] - b[63:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64 FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b) { return vreinterpret_m64_s64( vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b))); } // Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers // of a and saturates. // // r0 := SignedSaturate(a0 - b0) // r1 := SignedSaturate(a1 - b1) // ... // r7 := SignedSaturate(a7 - b7) // // https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90) FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_s16( vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers // of a and saturates. // // r0 := SignedSaturate(a0 - b0) // r1 := SignedSaturate(a1 - b1) // ... // r15 := SignedSaturate(a15 - b15) // // https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90) FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_s8( vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } // Subtracts the 8 unsigned 16-bit integers of bfrom the 8 unsigned 16-bit // integers of a and saturates.. // https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b) { return vreinterpretq_m128i_u16( vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); } // Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit // integers of a and saturates. // // r0 := UnsignedSaturate(a0 - b0) // r1 := UnsignedSaturate(a1 - b1) // ... // r15 := UnsignedSaturate(a15 - b15) // // https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90) FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); } #define _mm_ucomieq_sd _mm_comieq_sd #define _mm_ucomige_sd _mm_comige_sd #define _mm_ucomigt_sd _mm_comigt_sd #define _mm_ucomile_sd _mm_comile_sd #define _mm_ucomilt_sd _mm_comilt_sd #define _mm_ucomineq_sd _mm_comineq_sd // Return vector of type __m128d with undefined elements. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_pd FORCE_INLINE __m128d _mm_undefined_pd(void) { #if defined(__GNUC__) || defined(__clang__) //#pragma GCC diagnostic push //#pragma GCC diagnostic ignored "-Wuninitialized" #endif // __m128d a; __m128d a= (__m128d) ZERO_SSE2NEON(); BUG_SSE2NEON; return a; #if defined(__GNUC__) || defined(__clang__) //#pragma GCC diagnostic pop #endif } // Interleaves the upper 4 signed or unsigned 16-bit integers in a with the // upper 4 signed or unsigned 16-bit integers in b. 
// // r0 := a4 // r1 := b4 // r2 := a5 // r3 := b5 // r4 := a6 // r5 := b6 // r6 := a7 // r7 := b7 // // https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b) { #if defined(__aarch64__) return vreinterpretq_m128i_s16( vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); #else int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a)); int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b)); int16x4x2_t result = vzip_s16(a1, b1); return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1])); #endif } // Interleaves the upper 2 signed or unsigned 32-bit integers in a with the // upper 2 signed or unsigned 32-bit integers in b. // https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) { #if defined(__aarch64__) return vreinterpretq_m128i_s32( vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); #else int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a)); int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b)); int32x2x2_t result = vzip_s32(a1, b1); return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); #endif } // Interleaves the upper signed or unsigned 64-bit integer in a with the // upper signed or unsigned 64-bit integer in b. // // r0 := a1 // r1 := b1 FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) { int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a)); int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b)); return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h)); } // Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper // 8 signed or unsigned 8-bit integers in b. // // r0 := a8 // r1 := b8 // r2 := a9 // r3 := b9 // ... // r14 := a15 // r15 := b15 // // https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b) { #if defined(__aarch64__) return vreinterpretq_m128i_s8( vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); #else int8x8_t a1 = vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a))); int8x8_t b1 = vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b))); int8x8x2_t result = vzip_s8(a1, b1); return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1])); #endif } // Unpack and interleave double-precision (64-bit) floating-point elements from // the high half of a and b, and store the results in dst. // // DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { // dst[63:0] := src1[127:64] // dst[127:64] := src2[127:64] // RETURN dst[127:0] // } // dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_pd FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else return vreinterpretq_m128d_s64( vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)), vget_high_s64(vreinterpretq_s64_m128d(b)))); #endif } // Interleaves the lower 4 signed or unsigned 16-bit integers in a with the // lower 4 signed or unsigned 16-bit integers in b. 
// // r0 := a0 // r1 := b0 // r2 := a1 // r3 := b1 // r4 := a2 // r5 := b2 // r6 := a3 // r7 := b3 // // https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b) { #if defined(__aarch64__) return vreinterpretq_m128i_s16( vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); #else int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a)); int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b)); int16x4x2_t result = vzip_s16(a1, b1); return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1])); #endif } // Interleaves the lower 2 signed or unsigned 32 - bit integers in a with the // lower 2 signed or unsigned 32 - bit integers in b. // // r0 := a0 // r1 := b0 // r2 := a1 // r3 := b1 // // https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) { #if defined(__aarch64__) return vreinterpretq_m128i_s32( vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); #else int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a)); int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b)); int32x2x2_t result = vzip_s32(a1, b1); return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); #endif } FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) { int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a)); int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b)); return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l)); } // Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower // 8 signed or unsigned 8-bit integers in b. // // r0 := a0 // r1 := b0 // r2 := a1 // r3 := b1 // ... // r14 := a7 // r15 := b7 // // https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b) { #if defined(__aarch64__) return vreinterpretq_m128i_s8( vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); #else int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a))); int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b))); int8x8x2_t result = vzip_s8(a1, b1); return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1])); #endif } // Unpack and interleave double-precision (64-bit) floating-point elements from // the low half of a and b, and store the results in dst. // // DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { // dst[63:0] := src1[63:0] // dst[127:64] := src2[63:0] // RETURN dst[127:0] // } // dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_pd FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else return vreinterpretq_m128d_s64( vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)), vget_low_s64(vreinterpretq_s64_m128d(b)))); #endif } // Compute the bitwise XOR of packed double-precision (64-bit) floating-point // elements in a and b, and store the results in dst. 
// // FOR j := 0 to 1 // i := j*64 // dst[i+63:i] := a[i+63:i] XOR b[i+63:i] // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b) { return vreinterpretq_m128d_s64( veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); } // Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in // b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } /* SSE3 */ // Alternatively add and subtract packed double-precision (64-bit) // floating-point elements in a to/from packed elements in b, and store the // results in dst. // // FOR j := 0 to 1 // i := j*64 // IF ((j & 1) == 0) // dst[i+63:i] := a[i+63:i] - b[i+63:i] // ELSE // dst[i+63:i] := a[i+63:i] + b[i+63:i] // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_addsub_pd FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b) { static const __m128d mask = _mm_set_pd(1.0f, -1.0f); #if defined(__aarch64__) return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(mask))); #else return _mm_add_pd(_mm_mul_pd(b, mask), a); #endif } // Alternatively add and subtract packed single-precision (32-bit) // floating-point elements in a to/from packed elements in b, and store the // results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b) { static const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f); #if defined(__aarch64__) || defined(__ARM_FEATURE_FMA) /* VFPv4+ */ return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(mask), vreinterpretq_f32_m128(b))); #else return _mm_add_ps(_mm_mul_ps(b, mask), a); #endif } // Horizontally add adjacent pairs of double-precision (64-bit) floating-point // elements in a and b, and pack the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pd FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else double *da = (double *) &a; double *db = (double *) &b; double c[] = {da[0] + da[1], db[0] + db[1]}; return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c)); #endif } // Computes pairwise add of each argument as single-precision, floating-point // values a and b. // https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b) { #if defined(__aarch64__) return vreinterpretq_m128_f32( vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); #else float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); return vreinterpretq_m128_f32( vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32))); #endif } // Horizontally subtract adjacent pairs of double-precision (64-bit) // floating-point elements in a and b, and pack the results in dst. 
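// Worked example (illustrative values, low element listed first): with
// a = {5.0, 2.0} and b = {10.0, 4.0}, _mm_hsub_pd(a, b) returns
// {a[0] - a[1], b[0] - b[1]} = {3.0, 6.0}.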
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pd FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b) { #if defined(__aarch64__) float64x2_t a = vreinterpretq_f64_m128d(_a); float64x2_t b = vreinterpretq_f64_m128d(_b); return vreinterpretq_m128d_f64( vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b))); #else double *da = (double *) &_a; double *db = (double *) &_b; double c[] = {da[0] - da[1], db[0] - db[1]}; return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c)); #endif } // Horizontally subtract adjacent pairs of single-precision (32-bit) // floating-point elements in a and b, and pack the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b) { float32x4_t a = vreinterpretq_f32_m128(_a); float32x4_t b = vreinterpretq_f32_m128(_b); #if defined(__aarch64__) return vreinterpretq_m128_f32( vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b))); #else float32x4x2_t c = vuzpq_f32(a, b); return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1])); #endif } // Load 128-bits of integer data from unaligned memory into dst. This intrinsic // may perform better than _mm_loadu_si128 when the data crosses a cache line // boundary. // // dst[127:0] := MEM[mem_addr+127:mem_addr] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128 #define _mm_lddqu_si128 _mm_loadu_si128 // Load a double-precision (64-bit) floating-point element from memory into both // elements of dst. // // dst[63:0] := MEM[mem_addr+63:mem_addr] // dst[127:64] := MEM[mem_addr+63:mem_addr] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd #define _mm_loaddup_pd _mm_load1_pd // Duplicate the low double-precision (64-bit) floating-point element from a, // and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movedup_pd FORCE_INLINE __m128d _mm_movedup_pd(__m128d a) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0)); #else return vreinterpretq_m128d_u64( vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0))); #endif } // Duplicate odd-indexed single-precision (32-bit) floating-point elements // from a, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a) { #if __has_builtin(__builtin_shufflevector) return vreinterpretq_m128_f32(__builtin_shufflevector( vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3)); #else float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1); float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3); float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3}; return vreinterpretq_m128_f32(vld1q_f32(data)); #endif } // Duplicate even-indexed single-precision (32-bit) floating-point elements // from a, and store the results in dst. 
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a) { #if __has_builtin(__builtin_shufflevector) return vreinterpretq_m128_f32(__builtin_shufflevector( vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2)); #else float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2); float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2}; return vreinterpretq_m128_f32(vld1q_f32(data)); #endif } /* SSSE3 */ // Compute the absolute value of packed signed 16-bit integers in a, and store // the unsigned results in dst. // // FOR j := 0 to 7 // i := j*16 // dst[i+15:i] := ABS(a[i+15:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16 FORCE_INLINE __m128i _mm_abs_epi16(__m128i a) { return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a))); } // Compute the absolute value of packed signed 32-bit integers in a, and store // the unsigned results in dst. // // FOR j := 0 to 3 // i := j*32 // dst[i+31:i] := ABS(a[i+31:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32 FORCE_INLINE __m128i _mm_abs_epi32(__m128i a) { return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a))); } // Compute the absolute value of packed signed 8-bit integers in a, and store // the unsigned results in dst. // // FOR j := 0 to 15 // i := j*8 // dst[i+7:i] := ABS(a[i+7:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8 FORCE_INLINE __m128i _mm_abs_epi8(__m128i a) { return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a))); } // Compute the absolute value of packed signed 16-bit integers in a, and store // the unsigned results in dst. // // FOR j := 0 to 3 // i := j*16 // dst[i+15:i] := ABS(a[i+15:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16 FORCE_INLINE __m64 _mm_abs_pi16(__m64 a) { return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a))); } // Compute the absolute value of packed signed 32-bit integers in a, and store // the unsigned results in dst. // // FOR j := 0 to 1 // i := j*32 // dst[i+31:i] := ABS(a[i+31:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32 FORCE_INLINE __m64 _mm_abs_pi32(__m64 a) { return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a))); } // Compute the absolute value of packed signed 8-bit integers in a, and store // the unsigned results in dst. // // FOR j := 0 to 7 // i := j*8 // dst[i+7:i] := ABS(a[i+7:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8 FORCE_INLINE __m64 _mm_abs_pi8(__m64 a) { return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a))); } // Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift // the result right by imm8 bytes, and store the low 16 bytes in dst. 
// // tmp[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8) // dst[127:0] := tmp[127:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi8 FORCE_INLINE __m128i _mm_alignr_epi8(__m128i a, __m128i b, int imm) { if (_sse2neon_unlikely(imm & ~31)) return _mm_setzero_si128(); int idx; uint8x16_t tmp[2]; if (imm >= 16) { idx = imm - 16; tmp[0] = vreinterpretq_u8_m128i(a); tmp[1] = vdupq_n_u8(0); } else { idx = imm; tmp[0] = vreinterpretq_u8_m128i(b); tmp[1] = vreinterpretq_u8_m128i(a); } return vreinterpretq_m128i_u8(vld1q_u8(((uint8_t const *) tmp) + idx)); } // Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift // the result right by imm8 bytes, and store the low 8 bytes in dst. // // tmp[127:0] := ((a[63:0] << 64)[127:0] OR b[63:0]) >> (imm8*8) // dst[63:0] := tmp[63:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_pi8 #define _mm_alignr_pi8(a, b, imm) \ __extension__({ \ __m64 ret; \ if (_sse2neon_unlikely((imm) >= 16)) { \ ret = vreinterpret_m64_s8(vdup_n_s8(0)); \ } else { \ uint8x8_t tmp_low, tmp_high; \ if ((imm) >= 8) { \ const int idx = (imm) -8; \ tmp_low = vreinterpret_u8_m64(a); \ tmp_high = vdup_n_u8(0); \ ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ } else { \ const int idx = (imm); \ tmp_low = vreinterpret_u8_m64(b); \ tmp_high = vreinterpret_u8_m64(a); \ ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ } \ } \ ret; \ }) // Computes pairwise add of each argument as a 16-bit signed or unsigned integer // values a and b. FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b) { int16x8_t a = vreinterpretq_s16_m128i(_a); int16x8_t b = vreinterpretq_s16_m128i(_b); #if defined(__aarch64__) return vreinterpretq_m128i_s16(vpaddq_s16(a, b)); #else return vreinterpretq_m128i_s16( vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)), vpadd_s16(vget_low_s16(b), vget_high_s16(b)))); #endif } // Computes pairwise add of each argument as a 32-bit signed or unsigned integer // values a and b. FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b) { int32x4_t a = vreinterpretq_s32_m128i(_a); int32x4_t b = vreinterpretq_s32_m128i(_b); return vreinterpretq_m128i_s32( vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)), vpadd_s32(vget_low_s32(b), vget_high_s32(b)))); } // Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the // signed 16-bit results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16 FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b) { return vreinterpret_m64_s16( vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); } // Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the // signed 32-bit results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32 FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b) { return vreinterpret_m64_s32( vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))); } // Computes saturated pairwise sub of each argument as a 16-bit signed // integer values a and b. 
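// Worked example (illustrative values): with 16-bit lanes
// a = {32767, 1, -32768, -1, 3, 4, 5, 6}, the a-derived half of
// _mm_hadds_epi16(a, b) is {32767, -32768, 7, 11}: adjacent lanes are added
// with signed saturation, whereas the plain _mm_hadd_epi16 above would wrap
// 32767 + 1 around to -32768.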
FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b) { #if defined(__aarch64__) int16x8_t a = vreinterpretq_s16_m128i(_a); int16x8_t b = vreinterpretq_s16_m128i(_b); return vreinterpretq_s64_s16( vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); #else int32x4_t a = vreinterpretq_s32_m128i(_a); int32x4_t b = vreinterpretq_s32_m128i(_b); // Interleave using vshrn/vmovn // [a0|a2|a4|a6|b0|b2|b4|b6] // [a1|a3|a5|a7|b1|b3|b5|b7] int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b)); int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16)); // Saturated add return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357)); #endif } // Horizontally add adjacent pairs of signed 16-bit integers in a and b using // saturation, and pack the signed 16-bit results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadds_pi16 FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b) { int16x4_t a = vreinterpret_s16_m64(_a); int16x4_t b = vreinterpret_s16_m64(_b); #if defined(__aarch64__) return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b))); #else int16x4x2_t res = vuzp_s16(a, b); return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1])); #endif } // Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack // the signed 16-bit results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_epi16 FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b) { int16x8_t a = vreinterpretq_s16_m128i(_a); int16x8_t b = vreinterpretq_s16_m128i(_b); #if defined(__aarch64__) return vreinterpretq_m128i_s16( vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); #else int16x8x2_t c = vuzpq_s16(a, b); return vreinterpretq_m128i_s16(vsubq_s16(c.val[0], c.val[1])); #endif } // Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack // the signed 32-bit results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_epi32 FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b) { int32x4_t a = vreinterpretq_s32_m128i(_a); int32x4_t b = vreinterpretq_s32_m128i(_b); #if defined(__aarch64__) return vreinterpretq_m128i_s32( vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b))); #else int32x4x2_t c = vuzpq_s32(a, b); return vreinterpretq_m128i_s32(vsubq_s32(c.val[0], c.val[1])); #endif } // Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack // the signed 16-bit results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pi16 FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b) { int16x4_t a = vreinterpret_s16_m64(_a); int16x4_t b = vreinterpret_s16_m64(_b); #if defined(__aarch64__) return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b))); #else int16x4x2_t c = vuzp_s16(a, b); return vreinterpret_m64_s16(vsub_s16(c.val[0], c.val[1])); #endif } // Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack // the signed 32-bit results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_hsub_pi32 FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b) { int32x2_t a = vreinterpret_s32_m64(_a); int32x2_t b = vreinterpret_s32_m64(_b); #if defined(__aarch64__) return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b))); #else int32x2x2_t c = vuzp_s32(a, b); return vreinterpret_m64_s32(vsub_s32(c.val[0], c.val[1])); #endif } // Computes saturated pairwise difference of each argument as a 16-bit signed // integer values a and b. 
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16 FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b) { int16x8_t a = vreinterpretq_s16_m128i(_a); int16x8_t b = vreinterpretq_s16_m128i(_b); #if defined(__aarch64__) return vreinterpretq_m128i_s16( vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); #else int16x8x2_t c = vuzpq_s16(a, b); return vreinterpretq_m128i_s16(vqsubq_s16(c.val[0], c.val[1])); #endif } // Horizontally subtract adjacent pairs of signed 16-bit integers in a and b // using saturation, and pack the signed 16-bit results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_pi16 FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b) { int16x4_t a = vreinterpret_s16_m64(_a); int16x4_t b = vreinterpret_s16_m64(_b); #if defined(__aarch64__) return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b))); #else int16x4x2_t c = vuzp_s16(a, b); return vreinterpret_m64_s16(vqsub_s16(c.val[0], c.val[1])); #endif } // Vertically multiply each unsigned 8-bit integer from a with the corresponding // signed 8-bit integer from b, producing intermediate signed 16-bit integers. // Horizontally add adjacent pairs of intermediate signed 16-bit integers, // and pack the saturated results in dst. // // FOR j := 0 to 7 // i := j*16 // dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + // a[i+7:i]*b[i+7:i] ) // ENDFOR FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b) { #if defined(__aarch64__) uint8x16_t a = vreinterpretq_u8_m128i(_a); int8x16_t b = vreinterpretq_s8_m128i(_b); int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))), vmovl_s8(vget_low_s8(b))); int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))), vmovl_s8(vget_high_s8(b))); return vreinterpretq_m128i_s16( vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th))); #else // This would be much simpler if x86 would choose to zero extend OR sign // extend, not both. This could probably be optimized better. uint16x8_t a = vreinterpretq_u16_m128i(_a); int16x8_t b = vreinterpretq_s16_m128i(_b); // Zero extend a int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8)); int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00))); // Sign extend by shifting left then shifting right. int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8); int16x8_t b_odd = vshrq_n_s16(b, 8); // multiply int16x8_t prod1 = vmulq_s16(a_even, b_even); int16x8_t prod2 = vmulq_s16(a_odd, b_odd); // saturated add return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2)); #endif } // Vertically multiply each unsigned 8-bit integer from a with the corresponding // signed 8-bit integer from b, producing intermediate signed 16-bit integers. // Horizontally add adjacent pairs of intermediate signed 16-bit integers, and // pack the saturated results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maddubs_pi16 FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b) { uint16x4_t a = vreinterpret_u16_m64(_a); int16x4_t b = vreinterpret_s16_m64(_b); // Zero extend a int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8)); int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff))); // Sign extend by shifting left then shifting right. 
int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8); int16x4_t b_odd = vshr_n_s16(b, 8); // multiply int16x4_t prod1 = vmul_s16(a_even, b_even); int16x4_t prod2 = vmul_s16(a_odd, b_odd); // saturated add return vreinterpret_m64_s16(vqadd_s16(prod1, prod2)); } // Multiply packed signed 16-bit integers in a and b, producing intermediate // signed 32-bit integers. Shift right by 15 bits while rounding up, and store // the packed 16-bit integers in dst. // // r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15) // r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15) // r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15) // ... // r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15) FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b) { // Has issues due to saturation // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b)); // Multiply int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)), vget_low_s16(vreinterpretq_s16_m128i(b))); int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)), vget_high_s16(vreinterpretq_s16_m128i(b))); // Rounding narrowing shift right // narrow = (int16_t)((mul + 16384) >> 15); int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15); int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15); // Join together return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi)); } // Multiply packed signed 16-bit integers in a and b, producing intermediate // signed 32-bit integers. Truncate each intermediate integer to the 18 most // significant bits, round by adding 1, and store bits [16:1] to dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhrs_pi16 FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b) { int32x4_t mul_extend = vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b))); // Rounding narrowing shift right return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15)); } // Shuffle packed 8-bit integers in a according to shuffle control mask in the // corresponding 8-bit element of b, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8 FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b) { int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b uint8x16_t idx_masked = vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits #if defined(__aarch64__) return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked)); #elif defined(__GNUC__) int8x16_t ret; // %e and %f represent the even and odd D registers // respectively. __asm__ __volatile__( "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n" "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n" : [ret] "=&w"(ret) : [tbl] "w"(tbl), [idx] "w"(idx_masked)); return vreinterpretq_m128i_s8(ret); #else // use this line if testing on aarch64 int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)}; return vreinterpretq_m128i_s8( vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)), vtbl2_s8(a_split, vget_high_u8(idx_masked)))); #endif } // Shuffle packed 8-bit integers in a according to shuffle control mask in the // corresponding 8-bit element of b, and store the results in dst. 
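// Worked example for the byte shuffles (illustrative values): a control byte
// with its most significant bit set forces the corresponding destination byte
// to zero; otherwise its low bits select a source byte. With _mm_shuffle_epi8,
// control bytes {0x00, 0x03, 0x80, ...} therefore produce {a[0], a[3], 0, ...}.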
// // FOR j := 0 to 7 // i := j*8 // IF b[i+7] == 1 // dst[i+7:i] := 0 // ELSE // index[2:0] := b[i+2:i] // dst[i+7:i] := a[index*8+7:index*8] // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pi8 FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b) { const int8x8_t controlMask = vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t)(0x1 << 7 | 0x07))); int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask); return vreinterpret_m64_s8(res); } // Negate packed 16-bit integers in a when the corresponding signed // 16-bit integer in b is negative, and store the results in dst. // Element in dst are zeroed out when the corresponding element // in b is zero. // // for i in 0..7 // if b[i] < 0 // r[i] := -a[i] // else if b[i] == 0 // r[i] := 0 // else // r[i] := a[i] // fi // done FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b) { int16x8_t a = vreinterpretq_s16_m128i(_a); int16x8_t b = vreinterpretq_s16_m128i(_b); // signed shift right: faster than vclt // (b < 0) ? 0xFFFF : 0 uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15)); // (b == 0) ? 0xFFFF : 0 #if defined(__aarch64__) int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b)); #else int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0))); #endif // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative // 'a') based on ltMask int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a); // res = masked & (~zeroMask) int16x8_t res = vbicq_s16(masked, zeroMask); return vreinterpretq_m128i_s16(res); } // Negate packed 32-bit integers in a when the corresponding signed // 32-bit integer in b is negative, and store the results in dst. // Element in dst are zeroed out when the corresponding element // in b is zero. // // for i in 0..3 // if b[i] < 0 // r[i] := -a[i] // else if b[i] == 0 // r[i] := 0 // else // r[i] := a[i] // fi // done FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b) { int32x4_t a = vreinterpretq_s32_m128i(_a); int32x4_t b = vreinterpretq_s32_m128i(_b); // signed shift right: faster than vclt // (b < 0) ? 0xFFFFFFFF : 0 uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31)); // (b == 0) ? 0xFFFFFFFF : 0 #if defined(__aarch64__) int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b)); #else int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0))); #endif // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative // 'a') based on ltMask int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a); // res = masked & (~zeroMask) int32x4_t res = vbicq_s32(masked, zeroMask); return vreinterpretq_m128i_s32(res); } // Negate packed 8-bit integers in a when the corresponding signed // 8-bit integer in b is negative, and store the results in dst. // Element in dst are zeroed out when the corresponding element // in b is zero. // // for i in 0..15 // if b[i] < 0 // r[i] := -a[i] // else if b[i] == 0 // r[i] := 0 // else // r[i] := a[i] // fi // done FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b) { int8x16_t a = vreinterpretq_s8_m128i(_a); int8x16_t b = vreinterpretq_s8_m128i(_b); // signed shift right: faster than vclt // (b < 0) ? 0xFF : 0 uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7)); // (b == 0) ? 
0xFF : 0 #if defined(__aarch64__) int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b)); #else int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0))); #endif // bitwise select either a or negative 'a' (vnegq_s8(a) return negative 'a') // based on ltMask int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a); // res = masked & (~zeroMask) int8x16_t res = vbicq_s8(masked, zeroMask); return vreinterpretq_m128i_s8(res); } // Negate packed 16-bit integers in a when the corresponding signed 16-bit // integer in b is negative, and store the results in dst. Element in dst are // zeroed out when the corresponding element in b is zero. // // FOR j := 0 to 3 // i := j*16 // IF b[i+15:i] < 0 // dst[i+15:i] := -(a[i+15:i]) // ELSE IF b[i+15:i] == 0 // dst[i+15:i] := 0 // ELSE // dst[i+15:i] := a[i+15:i] // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16 FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b) { int16x4_t a = vreinterpret_s16_m64(_a); int16x4_t b = vreinterpret_s16_m64(_b); // signed shift right: faster than vclt // (b < 0) ? 0xFFFF : 0 uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15)); // (b == 0) ? 0xFFFF : 0 #if defined(__aarch64__) int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b)); #else int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0))); #endif // bitwise select either a or negative 'a' (vneg_s16(a) return negative 'a') // based on ltMask int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a); // res = masked & (~zeroMask) int16x4_t res = vbic_s16(masked, zeroMask); return vreinterpret_m64_s16(res); } // Negate packed 32-bit integers in a when the corresponding signed 32-bit // integer in b is negative, and store the results in dst. Element in dst are // zeroed out when the corresponding element in b is zero. // // FOR j := 0 to 1 // i := j*32 // IF b[i+31:i] < 0 // dst[i+31:i] := -(a[i+31:i]) // ELSE IF b[i+31:i] == 0 // dst[i+31:i] := 0 // ELSE // dst[i+31:i] := a[i+31:i] // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32 FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b) { int32x2_t a = vreinterpret_s32_m64(_a); int32x2_t b = vreinterpret_s32_m64(_b); // signed shift right: faster than vclt // (b < 0) ? 0xFFFFFFFF : 0 uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31)); // (b == 0) ? 0xFFFFFFFF : 0 #if defined(__aarch64__) int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b)); #else int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0))); #endif // bitwise select either a or negative 'a' (vneg_s32(a) return negative 'a') // based on ltMask int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a); // res = masked & (~zeroMask) int32x2_t res = vbic_s32(masked, zeroMask); return vreinterpret_m64_s32(res); } // Negate packed 8-bit integers in a when the corresponding signed 8-bit integer // in b is negative, and store the results in dst. Element in dst are zeroed out // when the corresponding element in b is zero. // // FOR j := 0 to 7 // i := j*8 // IF b[i+7:i] < 0 // dst[i+7:i] := -(a[i+7:i]) // ELSE IF b[i+7:i] == 0 // dst[i+7:i] := 0 // ELSE // dst[i+7:i] := a[i+7:i] // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8 FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b) { int8x8_t a = vreinterpret_s8_m64(_a); int8x8_t b = vreinterpret_s8_m64(_b); // signed shift right: faster than vclt // (b < 0) ? 
0xFF : 0 uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7)); // (b == 0) ? 0xFF : 0 #if defined(__aarch64__) int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b)); #else int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0))); #endif // bitwise select either a or negative 'a' (vneg_s8(a) return negative 'a') // based on ltMask int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a); // res = masked & (~zeroMask) int8x8_t res = vbic_s8(masked, zeroMask); return vreinterpret_m64_s8(res); } /* SSE4.1 */ // Blend packed 16-bit integers from a and b using control mask imm8, and store // the results in dst. // // FOR j := 0 to 7 // i := j*16 // IF imm8[j] // dst[i+15:i] := b[i+15:i] // ELSE // dst[i+15:i] := a[i+15:i] // FI // ENDFOR // FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b, // __constrange(0,255) int imm) #define _mm_blend_epi16(a, b, imm) \ __extension__({ \ const uint16_t _mask[8] = {((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0, \ ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0, \ ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0, \ ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0, \ ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0, \ ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0, \ ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0, \ ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0}; \ uint16x8_t _mask_vec = vld1q_u16(_mask); \ uint16x8_t _a = vreinterpretq_u16_m128i(a); \ uint16x8_t _b = vreinterpretq_u16_m128i(b); \ vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \ }) // Blend packed double-precision (64-bit) floating-point elements from a and b // using control mask imm8, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_pd #define _mm_blend_pd(a, b, imm) \ __extension__({ \ const uint64_t _mask[2] = { \ ((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0), \ ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)}; \ uint64x2_t _mask_vec = vld1q_u64(_mask); \ uint64x2_t _a = vreinterpretq_u64_m128d(a); \ uint64x2_t _b = vreinterpretq_u64_m128d(b); \ vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, _b, _a)); \ }) // Blend packed single-precision (32-bit) floating-point elements from a and b // using mask, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8) { const uint32_t ALIGN_STRUCT(16) data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0, ((imm8) & (1 << 1)) ? UINT32_MAX : 0, ((imm8) & (1 << 2)) ? UINT32_MAX : 0, ((imm8) & (1 << 3)) ? UINT32_MAX : 0}; uint32x4_t mask = vld1q_u32(data); float32x4_t a = vreinterpretq_f32_m128(_a); float32x4_t b = vreinterpretq_f32_m128(_b); return vreinterpretq_m128_f32(vbslq_f32(mask, b, a)); } // Blend packed 8-bit integers from a and b using mask, and store the results in // dst. // // FOR j := 0 to 15 // i := j*8 // IF mask[i+7] // dst[i+7:i] := b[i+7:i] // ELSE // dst[i+7:i] := a[i+7:i] // FI // ENDFOR FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask) { // Use a signed shift right to create a mask with the sign bit uint8x16_t mask = vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7)); uint8x16_t a = vreinterpretq_u8_m128i(_a); uint8x16_t b = vreinterpretq_u8_m128i(_b); return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a)); } // Blend packed double-precision (64-bit) floating-point elements from a and b // using mask, and store the results in dst. 
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask) { uint64x2_t mask = vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63)); #if defined(__aarch64__) float64x2_t a = vreinterpretq_f64_m128d(_a); float64x2_t b = vreinterpretq_f64_m128d(_b); return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a)); #else uint64x2_t a = vreinterpretq_u64_m128d(_a); uint64x2_t b = vreinterpretq_u64_m128d(_b); return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a)); #endif } // Blend packed single-precision (32-bit) floating-point elements from a and b // using mask, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask) { // Use a signed shift right to create a mask with the sign bit uint32x4_t mask = vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31)); float32x4_t a = vreinterpretq_f32_m128(_a); float32x4_t b = vreinterpretq_f32_m128(_b); return vreinterpretq_m128_f32(vbslq_f32(mask, b, a)); } // Round the packed double-precision (64-bit) floating-point elements in a up // to an integer value, and store the results as packed double-precision // floating-point elements in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_pd FORCE_INLINE __m128d _mm_ceil_pd(__m128d a) { #if defined(__aarch64__) return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a))); #else double *f = (double *) &a; return _mm_set_pd(CEIL(f[1]), CEIL(f[0])); #endif } // Round the packed single-precision (32-bit) floating-point elements in a up to // an integer value, and store the results as packed single-precision // floating-point elements in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps FORCE_INLINE __m128 _mm_ceil_ps(__m128 a) { #if defined(__aarch64__) return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a))); #else float *f = (float *) &a; return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0])); #endif } // Round the lower double-precision (64-bit) floating-point element in b up to // an integer value, store the result as a double-precision floating-point // element in the lower element of dst, and copy the upper element from a to the // upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_sd FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b) { return _mm_move_sd(a, _mm_ceil_pd(b)); } // Round the lower single-precision (32-bit) floating-point element in b up to // an integer value, store the result as a single-precision floating-point // element in the lower element of dst, and copy the upper 3 packed elements // from a to the upper elements of dst. 
// // dst[31:0] := CEIL(b[31:0]) // dst[127:32] := a[127:32] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_ceil_ps(b)); } // Compare packed 64-bit integers in a and b for equality, and store the results // in dst. FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b) { #if defined(__aarch64__) return vreinterpretq_m128i_u64( vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b))); #else // ARMv7 lacks vceqq_u64 // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)); uint32x4_t swapped = vrev64q_u32(cmp); return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped)); #endif } // Converts the four signed 16-bit integers in the lower 64 bits to four signed // 32-bit integers. FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a) { return vreinterpretq_m128i_s32( vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a)))); } // Converts the two signed 16-bit integers in the lower 32 bits to two signed // 64-bit integers. FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a) { int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */ int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ return vreinterpretq_m128i_s64(s64x2); } // Converts the two signed 32-bit integers in the lower 64 bits to two signed // 64-bit integers. FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a) { return vreinterpretq_m128i_s64( vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))); } // Converts the eight signed 8-bit integers in the lower 64 bits to eight signed // 16-bit integers. FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a) { int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */ int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ return vreinterpretq_m128i_s16(s16x8); } // Converts the four signed 8-bit integers in the lower 32 bits to four signed // 32-bit integers. FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a) { int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */ int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */ return vreinterpretq_m128i_s32(s32x4); } // Converts the two signed 8-bit integers in the lower 16 bits to two // signed 64-bit integers. FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a) { int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */ int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */ int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ return vreinterpretq_m128i_s64(s64x2); } // Converts the four unsigned 16-bit integers in the lower 64 bits to four // unsigned 32-bit integers. FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a) { return vreinterpretq_m128i_u32( vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a)))); } // Converts the two unsigned 16-bit integers in the lower 32 bits to two // unsigned 64-bit integers.
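// A minimal usage sketch of the widening conversions above (the helper name is
// illustrative and not part of sse2neon): zero-extend the low four unsigned
// 16-bit lanes of v and add them to a 32-bit accumulator.
FORCE_INLINE __m128i sse2neon_example_accumulate_u16_low4(__m128i acc, __m128i v)
{
    // _mm_cvtepu16_epi32 widens lanes 0..3 of v; _mm_add_epi32 sums lane-wise.
    return _mm_add_epi32(acc, _mm_cvtepu16_epi32(v));
}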
FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a) { uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */ uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ return vreinterpretq_m128i_u64(u64x2); } // Converts the two unsigned 32-bit integers in the lower 64 bits to two // unsigned 64-bit integers. FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a) { return vreinterpretq_m128i_u64( vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a)))); } // Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, // and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi16 FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a) { uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx HGFE DCBA */ uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */ return vreinterpretq_m128i_u16(u16x8); } // Converts the four unsigned 8-bit integers in the lower 32 bits to four // unsigned 32-bit integers. // https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a) { uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */ uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */ return vreinterpretq_m128i_u32(u32x4); } // Converts the two unsigned 8-bit integers in the lower 16 bits to two // unsigned 64-bit integers. FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a) { uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */ uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */ uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ return vreinterpretq_m128i_u64(u64x2); } // Conditionally multiply the packed double-precision (64-bit) floating-point // elements in a and b using the high 4 bits in imm8, sum the four products, and // conditionally store the sum in dst using the low 4 bits of imm8. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_pd FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm) { // Generate mask value from constant immediate bit value const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0; const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0; #if !SSE2NEON_PRECISE_DP const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0; const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0; #endif // Conditional multiplication #if !SSE2NEON_PRECISE_DP __m128d mul = _mm_mul_pd(a, b); const __m128d mulMask = _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask)); __m128d tmp = _mm_and_pd(mul, mulMask); #else #if defined(__aarch64__) double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) * vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0) : 0; double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) * vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1) : 0; #else double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0; double d1 = (imm & 0x20) ? 
((double *) &a)[1] * ((double *) &b)[1] : 0; #endif __m128d tmp = _mm_set_pd(d1, d0); #endif // Sum the products #if defined(__aarch64__) double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp)); #else double sum = *((double *) &tmp) + *(((double *) &tmp) + 1); #endif // Conditionally store the sum const __m128d sumMask = _mm_castsi128_pd(_mm_set_epi64x(bit1Mask, bit0Mask)); __m128d res = _mm_and_pd(_mm_set_pd1(sum), sumMask); return res; } // Conditionally multiply the packed single-precision (32-bit) floating-point // elements in a and b using the high 4 bits in imm8, sum the four products, // and conditionally store the sum in dst using the low 4 bits of imm. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm) { #if defined(__aarch64__) /* shortcuts */ if (imm == 0xFF) { return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b))); } if (imm == 0x7F) { float32x4_t m = _mm_mul_ps(a, b); m[3] = 0; return _mm_set1_ps(vaddvq_f32(m)); } #endif float s = 0, c = 0; float32x4_t f32a = vreinterpretq_f32_m128(a); float32x4_t f32b = vreinterpretq_f32_m128(b); /* To improve the accuracy of floating-point summation, Kahan algorithm * is used for each operation. */ if (imm & (1 << 4)) _sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]); if (imm & (1 << 5)) _sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]); if (imm & (1 << 6)) _sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]); if (imm & (1 << 7)) _sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]); s += c; float32x4_t res = { (imm & 0x1) ? s : 0, (imm & 0x2) ? s : 0, (imm & 0x4) ? s : 0, (imm & 0x8) ? s : 0, }; return vreinterpretq_m128_f32(res); } // Extracts the selected signed or unsigned 32-bit integer from a and zero // extends. // FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm) #define _mm_extract_epi32(a, imm) \ vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)) // Extracts the selected signed or unsigned 64-bit integer from a and zero // extends. // FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm) #define _mm_extract_epi64(a, imm) \ vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm)) // Extracts the selected signed or unsigned 8-bit integer from a and zero // extends. // FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm) // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi8 #define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm)) // Extracts the selected single-precision (32-bit) floating-point from a. // FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm) #define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm)) // Round the packed double-precision (64-bit) floating-point elements in a down // to an integer value, and store the results as packed double-precision // floating-point elements in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_pd FORCE_INLINE __m128d _mm_floor_pd(__m128d a) { #if defined(__aarch64__) return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a))); #else double *f = (double *) &a; return _mm_set_pd(FLOOR(f[1]), FLOOR(f[0])); #endif } // Round the packed single-precision (32-bit) floating-point elements in a down // to an integer value, and store the results as packed single-precision // floating-point elements in dst. 
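// Editor's illustrative addition (not part of the original header): a sketch
// of the _mm_dp_ps immediate encoding -- the high nibble selects which lane
// products enter the sum, the low nibble selects the output lanes that
// receive it. The helper name sse2neon_example_dot4_ps is hypothetical and
// assumes the _mm_cvtss_f32 wrapper defined earlier in this header.
FORCE_INLINE float sse2neon_example_dot4_ps(__m128 a, __m128 b)
{
    // 0xF1: multiply all four lane pairs, store the sum in lane 0 only.
    __m128 dp = _mm_dp_ps(a, b, 0xF1);
    return _mm_cvtss_f32(dp);
}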
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps FORCE_INLINE __m128 _mm_floor_ps(__m128 a) { #if defined(__aarch64__) return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a))); #else float *f = (float *) &a; return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0])); #endif } // Round the lower double-precision (64-bit) floating-point element in b down to // an integer value, store the result as a double-precision floating-point // element in the lower element of dst, and copy the upper element from a to the // upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_sd FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b) { return _mm_move_sd(a, _mm_floor_pd(b)); } // Round the lower single-precision (32-bit) floating-point element in b down to // an integer value, store the result as a single-precision floating-point // element in the lower element of dst, and copy the upper 3 packed elements // from a to the upper elements of dst. // // dst[31:0] := FLOOR(b[31:0]) // dst[127:32] := a[127:32] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_floor_ps(b)); } // Inserts the least significant 32 bits of b into the selected 32-bit integer // of a. // FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b, // __constrange(0,4) int imm) #define _mm_insert_epi32(a, b, imm) \ __extension__({ \ vreinterpretq_m128i_s32( \ vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \ }) // Inserts the least significant 64 bits of b into the selected 64-bit integer // of a. // FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b, // __constrange(0,2) int imm) #define _mm_insert_epi64(a, b, imm) \ __extension__({ \ vreinterpretq_m128i_s64( \ vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \ }) // Inserts the least significant 8 bits of b into the selected 8-bit integer // of a. // FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b, // __constrange(0,16) int imm) #define _mm_insert_epi8(a, b, imm) \ __extension__({ \ vreinterpretq_m128i_s8( \ vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \ }) // Copy a to tmp, then insert a single-precision (32-bit) floating-point // element from b into tmp using the control in imm8. Store tmp to dst using // the mask in imm8 (elements are zeroed out when the corresponding bit is set). // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=insert_ps #define _mm_insert_ps(a, b, imm8) \ __extension__({ \ float32x4_t tmp1 = \ vsetq_lane_f32(vgetq_lane_f32(b, (imm8 >> 6) & 0x3), \ vreinterpretq_f32_m128(a), 0); \ float32x4_t tmp2 = \ vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), vreinterpretq_f32_m128(a), \ ((imm8 >> 4) & 0x3)); \ const uint32_t data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0, \ ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \ ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \ ((imm8) & (1 << 3)) ? UINT32_MAX : 0}; \ uint32x4_t mask = vld1q_u32(data); \ float32x4_t all_zeros = vdupq_n_f32(0); \ \ vreinterpretq_m128_f32( \ vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))); \ }) // epi versions of min/max // Computes the pariwise maximums of the four signed 32-bit integer values of a // and b. // // A 128-bit parameter that can be defined with the following equations: // r0 := (a0 > b0) ? a0 : b0 // r1 := (a1 > b1) ? a1 : b1 // r2 := (a2 > b2) ? a2 : b2 // r3 := (a3 > b3) ? 
a3 : b3 // // https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Compare packed signed 8-bit integers in a and b, and store packed maximum // values in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8 FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_s8( vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } // Compare packed unsigned 16-bit integers in a and b, and store packed maximum // values in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16 FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b) { return vreinterpretq_m128i_u16( vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); } // Compare packed unsigned 32-bit integers in a and b, and store packed maximum // values in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32 FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b) { return vreinterpretq_m128i_u32( vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b))); } // Computes the pariwise minima of the four signed 32-bit integer values of a // and b. // // A 128-bit parameter that can be defined with the following equations: // r0 := (a0 < b0) ? a0 : b0 // r1 := (a1 < b1) ? a1 : b1 // r2 := (a2 < b2) ? a2 : b2 // r3 := (a3 < b3) ? a3 : b3 // // https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Compare packed signed 8-bit integers in a and b, and store packed minimum // values in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8 FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_s8( vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } // Compare packed unsigned 16-bit integers in a and b, and store packed minimum // values in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16 FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b) { return vreinterpretq_m128i_u16( vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); } // Compare packed unsigned 32-bit integers in a and b, and store packed minimum // values in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32 FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b) { return vreinterpretq_m128i_u32( vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b))); } // Horizontally compute the minimum amongst the packed unsigned 16-bit integers // in a, store the minimum and index in dst, and zero the remaining bits in dst. 
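// Editor's illustrative addition (not part of the original header): the
// min/max wrappers above are typically combined into a lane-wise clamp; the
// helper name sse2neon_example_clamp_epi32 is hypothetical.
FORCE_INLINE __m128i sse2neon_example_clamp_epi32(__m128i v,
                                                  __m128i lo,
                                                  __m128i hi)
{
    // max(lo, min(v, hi)) forces every signed 32-bit lane into [lo, hi].
    return _mm_max_epi32(lo, _mm_min_epi32(v, hi));
}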
// // index[2:0] := 0 // min[15:0] := a[15:0] // FOR j := 0 to 7 // i := j*16 // IF a[i+15:i] < min[15:0] // index[2:0] := j // min[15:0] := a[i+15:i] // FI // ENDFOR // dst[15:0] := min[15:0] // dst[18:16] := index[2:0] // dst[127:19] := 0 // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16 FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a) { __m128i dst; uint16_t min, idx = 0; // Find the minimum value #if defined(__aarch64__) min = vminvq_u16(vreinterpretq_u16_m128i(a)); #else __m64 tmp; tmp = vreinterpret_m64_u16( vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)), vget_high_u16(vreinterpretq_u16_m128i(a)))); tmp = vreinterpret_m64_u16( vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp))); tmp = vreinterpret_m64_u16( vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp))); min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0); #endif // Get the index of the minimum value int i; for (i = 0; i < 8; i++) { if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) { idx = (uint16_t) i; break; } a = _mm_srli_si128(a, 2); } // Generate result dst = _mm_setzero_si128(); dst = vreinterpretq_m128i_u16( vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0)); dst = vreinterpretq_m128i_u16( vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1)); return dst; } // Compute the sum of absolute differences (SADs) of quadruplets of unsigned // 8-bit integers in a compared to those in b, and store the 16-bit results in // dst. Eight SADs are performed using one quadruplet from b and eight // quadruplets from a. One quadruplet is selected from b starting at on the // offset specified in imm8. Eight quadruplets are formed from sequential 8-bit // integers selected from a starting at the offset specified in imm8. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mpsadbw_epu8 FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm) { uint8x16_t _a, _b; switch (imm & 0x4) { case 0: // do nothing _a = vreinterpretq_u8_m128i(a); break; case 4: _a = vreinterpretq_u8_u32(vextq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(a), 1)); break; default: #if defined(__GNUC__) || defined(__clang__) __builtin_unreachable(); #endif break; } switch (imm & 0x3) { case 0: _b = vreinterpretq_u8_u32( vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0))); break; case 1: _b = vreinterpretq_u8_u32( vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1))); break; case 2: _b = vreinterpretq_u8_u32( vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2))); break; case 3: _b = vreinterpretq_u8_u32( vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3))); break; default: #if defined(__GNUC__) || defined(__clang__) __builtin_unreachable(); #endif break; } int16x8_t c04, c15, c26, c37; uint8x8_t low_b = vget_low_u8(_b); c04 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b))); _a = vextq_u8(_a, _a, 1); c15 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b))); _a = vextq_u8(_a, _a, 1); c26 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b))); _a = vextq_u8(_a, _a, 1); c37 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b))); #if defined(__aarch64__) // |0|4|2|6| c04 = vpaddq_s16(c04, c26); // |1|5|3|7| c15 = vpaddq_s16(c15, c37); int32x4_t trn1_c = vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15)); int32x4_t trn2_c = vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15)); return 
vreinterpretq_m128i_s16(vpaddq_s16(vreinterpretq_s16_s32(trn1_c), vreinterpretq_s16_s32(trn2_c))); #else int16x4_t c01, c23, c45, c67; c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15)); c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37)); c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15)); c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37)); return vreinterpretq_m128i_s16( vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67))); #endif } // Multiply the low signed 32-bit integers from each packed 64-bit element in // a and b, and store the signed 64-bit results in dst. // // r0 := (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0 // r1 := (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2 FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b) { // vmull_s32 upcasts instead of masking, so we downcast. int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a)); int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b)); return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo)); } // Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or // unsigned 32-bit integers from b. // https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit // integers and saturates. // // r0 := UnsignedSaturate(a0) // r1 := UnsignedSaturate(a1) // r2 := UnsignedSaturate(a2) // r3 := UnsignedSaturate(a3) // r4 := UnsignedSaturate(b0) // r5 := UnsignedSaturate(b1) // r6 := UnsignedSaturate(b2) // r7 := UnsignedSaturate(b3) FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_u16( vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)), vqmovun_s32(vreinterpretq_s32_m128i(b)))); } // Round the packed double-precision (64-bit) floating-point elements in a using // the rounding parameter, and store the results as packed double-precision // floating-point elements in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_pd FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding) { #if defined(__aarch64__) switch (rounding) { case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a))); case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): return _mm_floor_pd(a); case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): return _mm_ceil_pd(a); case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a))); default: //_MM_FROUND_CUR_DIRECTION return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a))); } #else double *v_double = (double *) &a; if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) || (rounding == _MM_FROUND_CUR_DIRECTION && _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) { double res[2], tmp; for (int i = 0; i < 2; i++) { tmp = (v_double[i] < 0) ? 
-v_double[i] : v_double[i]; double roundDown = FLOOR(tmp); // Round down value double roundUp = CEIL(tmp); // Round up value double diffDown = tmp - roundDown; double diffUp = roundUp - tmp; if (diffDown < diffUp) { /* If it's closer to the round down value, then use it */ res[i] = roundDown; } else if (diffDown > diffUp) { /* If it's closer to the round up value, then use it */ res[i] = roundUp; } else { /* If it's equidistant between round up and round down value, * pick the one which is an even number */ double half = roundDown / 2; if (half != FLOOR(half)) { /* If the round down value is odd, return the round up value */ res[i] = roundUp; } else { /* If the round up value is odd, return the round down value */ res[i] = roundDown; } } res[i] = (v_double[i] < 0) ? -res[i] : res[i]; } return _mm_set_pd(res[1], res[0]); } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) || (rounding == _MM_FROUND_CUR_DIRECTION && _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) { return _mm_floor_pd(a); } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) || (rounding == _MM_FROUND_CUR_DIRECTION && _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) { return _mm_ceil_pd(a); } return _mm_set_pd(v_double[1] > 0 ? FLOOR(v_double[1]) : CEIL(v_double[1]), v_double[0] > 0 ? FLOOR(v_double[0]) : CEIL(v_double[0])); #endif } // Round the packed single-precision (32-bit) floating-point elements in a using // the rounding parameter, and store the results as packed single-precision // floating-point elements in dst. // software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding) { #if defined(__aarch64__) switch (rounding) { case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a))); case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): return _mm_floor_ps(a); case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): return _mm_ceil_ps(a); case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a))); default: //_MM_FROUND_CUR_DIRECTION return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a))); } #else float *v_float = (float *) &a; if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) || (rounding == _MM_FROUND_CUR_DIRECTION && _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) { uint32x4_t signmask = vdupq_n_u32(0x80000000); float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a), vdupq_n_f32(0.5f)); /* +/- 0.5 */ int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32( vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/ int32x4_t r_trunc = vcvtq_s32_f32( vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */ int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32( vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ float32x4_t delta = vsubq_f32( vreinterpretq_f32_m128(a), vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */ return vreinterpretq_m128_f32( vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal))); } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) || (rounding == _MM_FROUND_CUR_DIRECTION && _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) { return _mm_floor_ps(a); } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) || (rounding == _MM_FROUND_CUR_DIRECTION && 
_MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) { return _mm_ceil_ps(a); } return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]), v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]), v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]), v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0])); #endif } // Round the lower double-precision (64-bit) floating-point element in b using // the rounding parameter, store the result as a double-precision floating-point // element in the lower element of dst, and copy the upper element from a to the // upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_sd FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding) { return _mm_move_sd(a, _mm_round_pd(b, rounding)); } // Round the lower single-precision (32-bit) floating-point element in b using // the rounding parameter, store the result as a single-precision floating-point // element in the lower element of dst, and copy the upper 3 packed elements // from a to the upper elements of dst. Rounding is done according to the // rounding[3:0] parameter, which can be one of: // (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and // suppress exceptions // (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and // suppress exceptions // (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress // exceptions // (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress // exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see // _MM_SET_ROUNDING_MODE // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ss FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding) { return _mm_move_ss(a, _mm_round_ps(b, rounding)); } // Load 128-bits of integer data from memory into dst using a non-temporal // memory hint. mem_addr must be aligned on a 16-byte boundary or a // general-protection exception may be generated. // // dst[127:0] := MEM[mem_addr+127:mem_addr] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128 FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p) { #if __has_builtin(__builtin_nontemporal_store) return __builtin_nontemporal_load(p); #else return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p)); #endif } // Compute the bitwise NOT of a and then AND with a 128-bit vector containing // all 1's, and return 1 if the result is zero, otherwise return 0. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones FORCE_INLINE int _mm_test_all_ones(__m128i a) { return (uint64_t)(vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) == ~(uint64_t) 0; } // Compute the bitwise AND of 128 bits (representing integer data) in a and // mask, and return 1 if the result is zero, otherwise return 0. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask) { int64x2_t a_and_mask = vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask)); return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)); } // Compute the bitwise AND of 128 bits (representing integer data) in a and // mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute // the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is // zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, // otherwise return 0. 
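// Editor's illustrative addition (not part of the original header): a sketch
// of a common ptest-style idiom built on _mm_test_all_zeros above. The helper
// name is hypothetical; _mm_xor_si128 and _mm_set1_epi32 are assumed to be
// the SSE2 wrappers defined earlier in this header.
FORCE_INLINE int sse2neon_example_equal_si128(__m128i a, __m128i b)
{
    // a and b are bitwise equal exactly when (a XOR b) masked with all ones
    // is zero, which is what _mm_test_all_zeros reports.
    return _mm_test_all_zeros(_mm_xor_si128(a, b), _mm_set1_epi32(-1));
}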
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_test_mix_ones_zero FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask) { uint64x2_t zf = vandq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a)); uint64x2_t cf = vbicq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a)); uint64x2_t result = vandq_u64(zf, cf); return !(vgetq_lane_u64(result, 0) | vgetq_lane_u64(result, 1)); } // Compute the bitwise AND of 128 bits (representing integer data) in a and b, // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, // otherwise set CF to 0. Return the CF value. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128 FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b) { int64x2_t s64 = vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_m128i(a))), vreinterpretq_s64_m128i(b)); return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); } // Compute the bitwise AND of 128 bits (representing integer data) in a and b, // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, // otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, // otherwise return 0. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_si128 #define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b) // Compute the bitwise AND of 128 bits (representing integer data) in a and b, // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, // otherwise set CF to 0. Return the ZF value. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128 FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b) { int64x2_t s64 = vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)); return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); } /* SSE4.2 */ // Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers // in b for greater than. FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b) { #if defined(__aarch64__) return vreinterpretq_m128i_u64( vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); #else return vreinterpretq_m128i_s64(vshrq_n_s64( vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)), 63)); #endif } // Starting with the initial value in crc, accumulates a CRC32 value for // unsigned 16-bit integer v. // https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100) FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v) { #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t" : [c] "+r"(crc) : [v] "r"(v)); #else crc = _mm_crc32_u8(crc, v & 0xff); crc = _mm_crc32_u8(crc, (v >> 8) & 0xff); #endif return crc; } // Starting with the initial value in crc, accumulates a CRC32 value for // unsigned 32-bit integer v. 
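// Editor's illustrative addition (not part of the original header): a sketch
// of incremental CRC32-C accumulation over a buffer of 16-bit words using the
// _mm_crc32_u16 wrapper defined above. The helper name and signature are
// hypothetical.
FORCE_INLINE uint32_t sse2neon_example_crc32_u16_buf(uint32_t crc,
                                                     const uint16_t *p,
                                                     int n)
{
    // Each call folds another 16 bits into the running checksum.
    for (int i = 0; i < n; i++)
        crc = _mm_crc32_u16(crc, p[i]);
    return crc;
}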
// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100) FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v) { #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t" : [c] "+r"(crc) : [v] "r"(v)); #else crc = _mm_crc32_u16(crc, v & 0xffff); crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff); #endif return crc; } // Starting with the initial value in crc, accumulates a CRC32 value for // unsigned 64-bit integer v. // https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100) FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v) { #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t" : [c] "+r"(crc) : [v] "r"(v)); #else crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff); crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff); #endif return crc; } // Starting with the initial value in crc, accumulates a CRC32 value for // unsigned 8-bit integer v. // https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100) FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v) { #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t" : [c] "+r"(crc) : [v] "r"(v)); #else crc ^= v; for (int bit = 0; bit < 8; bit++) { if (crc & 1) crc = (crc >> 1) ^ UINT32_C(0x82f63b78); else crc = (crc >> 1); } #endif return crc; } /* AES */ #if !defined(__ARM_FEATURE_CRYPTO) /* clang-format off */ #define SSE2NEON_AES_DATA(w) \ { \ w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \ w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \ w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \ w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \ w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \ w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \ w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \ w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \ w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \ w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \ w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \ w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \ w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \ w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \ w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \ w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \ w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \ w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \ w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \ w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \ w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \ w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \ w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \ w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \ w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \ w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \ w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \ w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \ w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \ w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \ w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \ w(0x35), w(0x57), 
w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \ w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \ w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \ w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \ w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \ w(0xb0), w(0x54), w(0xbb), w(0x16) \ } /* clang-format on */ /* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */ #define SSE2NEON_AES_H0(x) (x) static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0); #undef SSE2NEON_AES_H0 // In the absence of crypto extensions, implement aesenc using regular neon // intrinsics instead. See: // https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/ // https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and // https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52 // for more information Reproduced with permission of the author. FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey) { #if defined(__aarch64__) static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3, 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb}; static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4, 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc}; uint8x16_t v; uint8x16_t w = vreinterpretq_u8_m128i(EncBlock); // shift rows w = vqtbl1q_u8(w, vld1q_u8(shift_rows)); // sub bytes v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(SSE2NEON_sbox), w); v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40); v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80); v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0); // mix columns w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b); w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v); w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8)); // add round key return vreinterpretq_m128i_u8(w) ^ RoundKey; #else /* ARMv7-A NEON implementation */ #define SSE2NEON_AES_B2W(b0, b1, b2, b3) \ (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \ (b0)) #define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */)) #define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x) #define SSE2NEON_AES_U0(p) \ SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p)) #define SSE2NEON_AES_U1(p) \ SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p) #define SSE2NEON_AES_U2(p) \ SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p) #define SSE2NEON_AES_U3(p) \ SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p)) static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = { SSE2NEON_AES_DATA(SSE2NEON_AES_U0), SSE2NEON_AES_DATA(SSE2NEON_AES_U1), SSE2NEON_AES_DATA(SSE2NEON_AES_U2), SSE2NEON_AES_DATA(SSE2NEON_AES_U3), }; #undef SSE2NEON_AES_B2W #undef SSE2NEON_AES_F2 #undef SSE2NEON_AES_F3 #undef SSE2NEON_AES_U0 #undef SSE2NEON_AES_U1 #undef SSE2NEON_AES_U2 #undef SSE2NEON_AES_U3 uint32_t x0 = _mm_cvtsi128_si32(EncBlock); uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55)); uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA)); uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF)); __m128i out = _mm_set_epi32( (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^ aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]), (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^ aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]), (aes_table[0][x1 & 
0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^ aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]), (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^ aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24])); return _mm_xor_si128(out, RoundKey); #endif } // Perform the last round of an AES encryption flow on data (state) in a using // the round key in RoundKey, and store the result in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128 FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) { /* FIXME: optimized for NEON */ uint8_t v[4][4] = { {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)], SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)], SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)], SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]}, {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)], SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)], SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)], SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]}, {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)], SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)], SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)], SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]}, {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)], SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)], SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)], SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]}, }; for (int i = 0; i < 16; i++) vreinterpretq_nth_u8_m128i(a, i) = v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i); return a; } // Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist. // This instruction generates a round key for AES encryption. See // https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/ // for details. // // https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon) { uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55)); uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF)); for (int i = 0; i < 4; ++i) { ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]]; ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]]; } return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3, ((X1 >> 8) | (X1 << 24)) ^ rcon, X1); } #undef SSE2NEON_AES_DATA #else /* __ARM_FEATURE_CRYPTO */ // Implements equivalent of 'aesenc' by combining AESE (with an empty key) and // AESMC and then manually applying the real key as an xor operation. This // unfortunately means an additional xor op; the compiler should be able to // optimize this away for repeated calls however. See // https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a // for more details. 
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^ vreinterpretq_u8_m128i(b)); } // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128 FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) { return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8( vreinterpretq_u8_m128i(a), vdupq_n_u8(0))), RoundKey); } FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) { // AESE does ShiftRows and SubBytes on A uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)); uint8x16_t dest = { // Undo ShiftRows step from AESE and extract X1 and X3 u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1) u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1)) u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3) u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3)) }; uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon}; return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r); } #endif /* Others */ // Perform a carry-less multiplication of two 64-bit integers, selected from a // and b according to imm8, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128 FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm) { uint64x2_t a = vreinterpretq_u64_m128i(_a); uint64x2_t b = vreinterpretq_u64_m128i(_b); switch (imm & 0x11) { case 0x00: return vreinterpretq_m128i_u64( _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b))); case 0x01: return vreinterpretq_m128i_u64( _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b))); case 0x10: return vreinterpretq_m128i_u64( _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b))); case 0x11: return vreinterpretq_m128i_u64( _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b))); default: abort(); } } FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode() { union { fpcr_bitfield field; #if defined(__aarch64__) uint64_t value; #else uint32_t value; #endif } r; #if defined(__aarch64__) __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */ #else __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ #endif return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF; } // Count the number of bits set to 1 in unsigned 32-bit integer a, and // return that count in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32 FORCE_INLINE int _mm_popcnt_u32(unsigned int a) { #if defined(__aarch64__) #if __has_builtin(__builtin_popcount) return __builtin_popcount(a); #else return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a))); #endif #else uint32_t count = 0; uint8x8_t input_val, count8x8_val; uint16x4_t count16x4_val; uint32x2_t count32x2_val; input_val = vld1_u8((uint8_t *) &a); count8x8_val = vcnt_u8(input_val); count16x4_val = vpaddl_u8(count8x8_val); count32x2_val = vpaddl_u16(count16x4_val); vst1_u32(&count, count32x2_val); return count; #endif } // Count the number of bits set to 1 in unsigned 64-bit integer a, and // return that count in dst. 
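// Editor's illustrative addition (not part of the original header): a minimal
// sketch of a typical _mm_popcnt_u32 use; the helper name is hypothetical.
FORCE_INLINE int sse2neon_example_hamming_u32(unsigned int a, unsigned int b)
{
    // The Hamming distance between two words is the number of set bits in
    // their XOR.
    return _mm_popcnt_u32(a ^ b);
}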
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64 FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) { #if defined(__aarch64__) #if __has_builtin(__builtin_popcountll) return __builtin_popcountll(a); #else return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a))); #endif #else uint64_t count = 0; uint8x8_t input_val, count8x8_val; uint16x4_t count16x4_val; uint32x2_t count32x2_val; uint64x1_t count64x1_val; input_val = vld1_u8((uint8_t *) &a); count8x8_val = vcnt_u8(input_val); count16x4_val = vpaddl_u8(count8x8_val); count32x2_val = vpaddl_u16(count16x4_val); count64x1_val = vpaddl_u32(count32x2_val); vst1_u64(&count, count64x1_val); return count; #endif } FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag) { // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting, // regardless of the value of the FZ bit. union { fpcr_bitfield field; #if defined(__aarch64__) uint64_t value; #else uint32_t value; #endif } r; #if defined(__aarch64__) __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */ #else __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ #endif r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON; #if defined(__aarch64__) __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */ #else __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ #endif } #if defined(__GNUC__) || defined(__clang__) #pragma pop_macro("ALIGN_STRUCT") #pragma pop_macro("FORCE_INLINE") #endif #if defined(__GNUC__) && !defined(__clang__) #pragma GCC pop_options #endif #endif RandomFieldsUtils/inst/include/solve_gpu.h0000644000176200001440000000066314227157055020433 0ustar liggesusers #ifndef RFutils_gpusolve #define RFutils_gpusolve 1 int cholGPU(bool copy, double *matrix, Uint size, double *B, Uint rhs_cols, double *LogDet, double *RESULT); void mgpuSolve(double *matrix, Uint individuals, double *vector); void gpu_relmat_custom(Uint*, double*, Uint, Uint); void gpu_relmat_cublas(Uint*, double*, Uint, Uint); // #define PADDIM 4L //#define BLOCKS 1024 #define THREADS_PER_BLOCK 1024 //2048 / 32 #endif RandomFieldsUtils/inst/include/win_linux_aux.h0000644000176200001440000000222314227157055021313 0ustar liggesusers /* Authors Martin Schlather, schlather@math.uni-mannheim.de Copyright (C) 2015 -- 2021 Martin Schlather This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ #ifndef WIN_LINUX_AUX_H #define WIN_LINUX_AUX_H 1 uint32_t cpuid_info(int Blatt, int Register);//MINGWCPUID, WINCPUID, LINUXCPUID #ifdef __cplusplus extern "C" { #endif void sleepMilli(int *milli); void sleepMicro(int *milli); void pid(int *i); void hostname(char **h, int *i); bool parallel(); #ifdef __cplusplus } #endif #endif /* WIN_LINUX_AUX_H */ RandomFieldsUtils/inst/CITATION0000644000176200001440000000161614227157055015770 0ustar liggesuserscitHeader("To cite RandomFieldsUtils in publications use:") year <- sub("-.*", "", meta$Date) note <- sprintf("R package version %s", meta$Version) bibentry(bibtype="Manual", header="For general purposes please refer to", footer="", title = "{RandomFieldsUtils}: Utilites for the Simulation and Analysis of Random Fields", author = c(person("Martin", "Schlather", role=c("cre", "aut")), person("Alexander", "Freudenberg", role="aut"), person("Reinhard", "Furrer", role="ctb"), person("Martin", "Kroll", role="ctb"), person(given=c("Brian", "D"), "Ripley", role="ctb"), person(given=c("John", "W."), "Ratcliff", role="cph") ), year = year, note = note, url = "https://cran.r-project.org/package=RandomFieldsUtils" ) RandomFieldsUtils/cleanup0000755000176200001440000000006514227157056015231 0ustar liggesusers#!/bin/sh rm -rf config.* src/Makevars src/config.h RandomFieldsUtils/configure0000755000176200001440000036167314227157056015602 0ustar liggesusers#! /bin/sh # Guess values for system-dependent variables and create Makefiles. # Generated by GNU Autoconf 2.69 for RandomFieldsUtils 1.0. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. # # # This configure script is free software; the Free Software Foundation # gives unlimited permission to copy, distribute and modify it. ## -------------------- ## ## M4sh Initialization. ## ## -------------------- ## # Be more Bourne compatible DUALCASE=1; export DUALCASE # for MKS sh if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then : emulate sh NULLCMD=: # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which # is contrary to our usage. Disable this feature. alias -g '${1+"$@"}'='"$@"' setopt NO_GLOB_SUBST else case `(set -o) 2>/dev/null` in #( *posix*) : set -o posix ;; #( *) : ;; esac fi as_nl=' ' export as_nl # Printing a long string crashes Solaris 7 /usr/bin/printf. as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo # Prefer a ksh shell builtin over an external printf program on Solaris, # but without wasting forks for bash or zsh. if test -z "$BASH_VERSION$ZSH_VERSION" \ && (test "X`print -r -- $as_echo`" = "X$as_echo") 2>/dev/null; then as_echo='print -r --' as_echo_n='print -rn --' elif (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then as_echo='printf %s\n' as_echo_n='printf %s' else if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"' as_echo_n='/usr/ucb/echo -n' else as_echo_body='eval expr "X$1" : "X\\(.*\\)"' as_echo_n_body='eval arg=$1; case $arg in #( *"$as_nl"*) expr "X$arg" : "X\\(.*\\)$as_nl"; arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;; esac; expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl" ' export as_echo_n_body as_echo_n='sh -c $as_echo_n_body as_echo' fi export as_echo_body as_echo='sh -c $as_echo_body as_echo' fi # The user is always right. 
if test "${PATH_SEPARATOR+set}" != set; then PATH_SEPARATOR=: (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && { (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 || PATH_SEPARATOR=';' } fi # IFS # We need space, tab and new line, in precisely that order. Quoting is # there to prevent editors from complaining about space-tab. # (If _AS_PATH_WALK were called with IFS unset, it would disable word # splitting by setting IFS to empty value.) IFS=" "" $as_nl" # Find who we are. Look in the path if we contain no directory separator. as_myself= case $0 in #(( *[\\/]* ) as_myself=$0 ;; *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break done IFS=$as_save_IFS ;; esac # We did not find ourselves, most probably we were run as `sh COMMAND' # in which case we are not to be found in the path. if test "x$as_myself" = x; then as_myself=$0 fi if test ! -f "$as_myself"; then $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2 exit 1 fi # Unset variables that we do not need and which cause bugs (e.g. in # pre-3.0 UWIN ksh). But do not cause bugs in bash 2.01; the "|| exit 1" # suppresses any "Segmentation fault" message there. '((' could # trigger a bug in pdksh 5.2.14. for as_var in BASH_ENV ENV MAIL MAILPATH do eval test x\${$as_var+set} = xset \ && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || : done PS1='$ ' PS2='> ' PS4='+ ' # NLS nuisances. LC_ALL=C export LC_ALL LANGUAGE=C export LANGUAGE # CDPATH. (unset CDPATH) >/dev/null 2>&1 && unset CDPATH # Use a proper internal environment variable to ensure we don't fall # into an infinite loop, continuously re-executing ourselves. if test x"${_as_can_reexec}" != xno && test "x$CONFIG_SHELL" != x; then _as_can_reexec=no; export _as_can_reexec; # We cannot yet assume a decent shell, so we have to provide a # neutralization value for shells without unset; and this also # works around shells that cannot unset nonexistent variables. # Preserve -v and -x to the replacement shell. BASH_ENV=/dev/null ENV=/dev/null (unset BASH_ENV) >/dev/null 2>&1 && unset BASH_ENV ENV case $- in # (((( *v*x* | *x*v* ) as_opts=-vx ;; *v* ) as_opts=-v ;; *x* ) as_opts=-x ;; * ) as_opts= ;; esac exec $CONFIG_SHELL $as_opts "$as_myself" ${1+"$@"} # Admittedly, this is quite paranoid, since all the known shells bail # out after a failed `exec'. $as_echo "$0: could not re-execute with $CONFIG_SHELL" >&2 as_fn_exit 255 fi # We don't want this to propagate to other subprocesses. { _as_can_reexec=; unset _as_can_reexec;} if test "x$CONFIG_SHELL" = x; then as_bourne_compatible="if test -n \"\${ZSH_VERSION+set}\" && (emulate sh) >/dev/null 2>&1; then : emulate sh NULLCMD=: # Pre-4.2 versions of Zsh do word splitting on \${1+\"\$@\"}, which # is contrary to our usage. Disable this feature. 
alias -g '\${1+\"\$@\"}'='\"\$@\"' setopt NO_GLOB_SUBST else case \`(set -o) 2>/dev/null\` in #( *posix*) : set -o posix ;; #( *) : ;; esac fi " as_required="as_fn_return () { (exit \$1); } as_fn_success () { as_fn_return 0; } as_fn_failure () { as_fn_return 1; } as_fn_ret_success () { return 0; } as_fn_ret_failure () { return 1; } exitcode=0 as_fn_success || { exitcode=1; echo as_fn_success failed.; } as_fn_failure && { exitcode=1; echo as_fn_failure succeeded.; } as_fn_ret_success || { exitcode=1; echo as_fn_ret_success failed.; } as_fn_ret_failure && { exitcode=1; echo as_fn_ret_failure succeeded.; } if ( set x; as_fn_ret_success y && test x = \"\$1\" ); then : else exitcode=1; echo positional parameters were not saved. fi test x\$exitcode = x0 || exit 1 test -x / || exit 1" as_suggested=" as_lineno_1=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_1a=\$LINENO as_lineno_2=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_2a=\$LINENO eval 'test \"x\$as_lineno_1'\$as_run'\" != \"x\$as_lineno_2'\$as_run'\" && test \"x\`expr \$as_lineno_1'\$as_run' + 1\`\" = \"x\$as_lineno_2'\$as_run'\"' || exit 1" if (eval "$as_required") 2>/dev/null; then : as_have_required=yes else as_have_required=no fi if test x$as_have_required = xyes && (eval "$as_suggested") 2>/dev/null; then : else as_save_IFS=$IFS; IFS=$PATH_SEPARATOR as_found=false for as_dir in /bin$PATH_SEPARATOR/usr/bin$PATH_SEPARATOR$PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. as_found=: case $as_dir in #( /*) for as_base in sh bash ksh sh5; do # Try only shells that exist, to save several forks. as_shell=$as_dir/$as_base if { test -f "$as_shell" || test -f "$as_shell.exe"; } && { $as_echo "$as_bourne_compatible""$as_required" | as_run=a "$as_shell"; } 2>/dev/null; then : CONFIG_SHELL=$as_shell as_have_required=yes if { $as_echo "$as_bourne_compatible""$as_suggested" | as_run=a "$as_shell"; } 2>/dev/null; then : break 2 fi fi done;; esac as_found=false done $as_found || { if { test -f "$SHELL" || test -f "$SHELL.exe"; } && { $as_echo "$as_bourne_compatible""$as_required" | as_run=a "$SHELL"; } 2>/dev/null; then : CONFIG_SHELL=$SHELL as_have_required=yes fi; } IFS=$as_save_IFS if test "x$CONFIG_SHELL" != x; then : export CONFIG_SHELL # We cannot yet assume a decent shell, so we have to provide a # neutralization value for shells without unset; and this also # works around shells that cannot unset nonexistent variables. # Preserve -v and -x to the replacement shell. BASH_ENV=/dev/null ENV=/dev/null (unset BASH_ENV) >/dev/null 2>&1 && unset BASH_ENV ENV case $- in # (((( *v*x* | *x*v* ) as_opts=-vx ;; *v* ) as_opts=-v ;; *x* ) as_opts=-x ;; * ) as_opts= ;; esac exec $CONFIG_SHELL $as_opts "$as_myself" ${1+"$@"} # Admittedly, this is quite paranoid, since all the known shells bail # out after a failed `exec'. $as_echo "$0: could not re-execute with $CONFIG_SHELL" >&2 exit 255 fi if test x$as_have_required = xno; then : $as_echo "$0: This script requires a shell more modern than all" $as_echo "$0: the shells that I found on your system." if test x${ZSH_VERSION+set} = xset ; then $as_echo "$0: In particular, zsh $ZSH_VERSION has bugs and should" $as_echo "$0: be upgraded to zsh 4.3.4 or later." else $as_echo "$0: Please tell bug-autoconf@gnu.org about your system, $0: including any error possibly output before this $0: message. Then install a modern shell, or manually run $0: the script under such a shell if you do have one." 
fi exit 1 fi fi fi SHELL=${CONFIG_SHELL-/bin/sh} export SHELL # Unset more variables known to interfere with behavior of common tools. CLICOLOR_FORCE= GREP_OPTIONS= unset CLICOLOR_FORCE GREP_OPTIONS ## --------------------- ## ## M4sh Shell Functions. ## ## --------------------- ## # as_fn_unset VAR # --------------- # Portably unset VAR. as_fn_unset () { { eval $1=; unset $1;} } as_unset=as_fn_unset # as_fn_set_status STATUS # ----------------------- # Set $? to STATUS, without forking. as_fn_set_status () { return $1 } # as_fn_set_status # as_fn_exit STATUS # ----------------- # Exit the shell with STATUS, even in a "trap 0" or "set -e" context. as_fn_exit () { set +e as_fn_set_status $1 exit $1 } # as_fn_exit # as_fn_mkdir_p # ------------- # Create "$as_dir" as a directory, including parents if necessary. as_fn_mkdir_p () { case $as_dir in #( -*) as_dir=./$as_dir;; esac test -d "$as_dir" || eval $as_mkdir_p || { as_dirs= while :; do case $as_dir in #( *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'( *) as_qdir=$as_dir;; esac as_dirs="'$as_qdir' $as_dirs" as_dir=`$as_dirname -- "$as_dir" || $as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ X"$as_dir" : 'X\(//\)[^/]' \| \ X"$as_dir" : 'X\(//\)$' \| \ X"$as_dir" : 'X\(/\)' \| . 2>/dev/null || $as_echo X"$as_dir" | sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ s//\1/ q } /^X\(\/\/\)[^/].*/{ s//\1/ q } /^X\(\/\/\)$/{ s//\1/ q } /^X\(\/\).*/{ s//\1/ q } s/.*/./; q'` test -d "$as_dir" && break done test -z "$as_dirs" || eval "mkdir $as_dirs" } || test -d "$as_dir" || as_fn_error $? "cannot create directory $as_dir" } # as_fn_mkdir_p # as_fn_executable_p FILE # ----------------------- # Test if FILE is an executable regular file. as_fn_executable_p () { test -f "$1" && test -x "$1" } # as_fn_executable_p # as_fn_append VAR VALUE # ---------------------- # Append the text in VALUE to the end of the definition contained in VAR. Take # advantage of any shell optimizations that allow amortized linear growth over # repeated appends, instead of the typical quadratic growth present in naive # implementations. if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null; then : eval 'as_fn_append () { eval $1+=\$2 }' else as_fn_append () { eval $1=\$$1\$2 } fi # as_fn_append # as_fn_arith ARG... # ------------------ # Perform arithmetic evaluation on the ARGs, and store the result in the # global $as_val. Take advantage of shells that can avoid forks. The arguments # must be portable across $(()) and expr. if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null; then : eval 'as_fn_arith () { as_val=$(( $* )) }' else as_fn_arith () { as_val=`expr "$@" || test $? -eq 1` } fi # as_fn_arith # as_fn_error STATUS ERROR [LINENO LOG_FD] # ---------------------------------------- # Output "`basename $0`: error: ERROR" to stderr. If LINENO and LOG_FD are # provided, also output the error to LOG_FD, referencing LINENO. Then exit the # script with STATUS, using 1 if that was 0. 
as_fn_error () { as_status=$1; test $as_status -eq 0 && as_status=1 if test "$4"; then as_lineno=${as_lineno-"$3"} as_lineno_stack=as_lineno_stack=$as_lineno_stack $as_echo "$as_me:${as_lineno-$LINENO}: error: $2" >&$4 fi $as_echo "$as_me: error: $2" >&2 as_fn_exit $as_status } # as_fn_error if expr a : '\(a\)' >/dev/null 2>&1 && test "X`expr 00001 : '.*\(...\)'`" = X001; then as_expr=expr else as_expr=false fi if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then as_basename=basename else as_basename=false fi if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then as_dirname=dirname else as_dirname=false fi as_me=`$as_basename -- "$0" || $as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \ X"$0" : 'X\(//\)$' \| \ X"$0" : 'X\(/\)' \| . 2>/dev/null || $as_echo X/"$0" | sed '/^.*\/\([^/][^/]*\)\/*$/{ s//\1/ q } /^X\/\(\/\/\)$/{ s//\1/ q } /^X\/\(\/\).*/{ s//\1/ q } s/.*/./; q'` # Avoid depending upon Character Ranges. as_cr_letters='abcdefghijklmnopqrstuvwxyz' as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ' as_cr_Letters=$as_cr_letters$as_cr_LETTERS as_cr_digits='0123456789' as_cr_alnum=$as_cr_Letters$as_cr_digits as_lineno_1=$LINENO as_lineno_1a=$LINENO as_lineno_2=$LINENO as_lineno_2a=$LINENO eval 'test "x$as_lineno_1'$as_run'" != "x$as_lineno_2'$as_run'" && test "x`expr $as_lineno_1'$as_run' + 1`" = "x$as_lineno_2'$as_run'"' || { # Blame Lee E. McMahon (1931-1989) for sed's syntax. :-) sed -n ' p /[$]LINENO/= ' <$as_myself | sed ' s/[$]LINENO.*/&-/ t lineno b :lineno N :loop s/[$]LINENO\([^'$as_cr_alnum'_].*\n\)\(.*\)/\2\1\2/ t loop s/-\n.*// ' >$as_me.lineno && chmod +x "$as_me.lineno" || { $as_echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2; as_fn_exit 1; } # If we had to re-execute with $CONFIG_SHELL, we're ensured to have # already done that, so ensure we don't try to do so again and fall # in an infinite loop. This has already happened in practice. _as_can_reexec=no; export _as_can_reexec # Don't try to exec as it changes $[0], causing all sort of problems # (the dirname of $[0] is not the place where we might find the # original and so on. Autoconf is especially sensitive to this). . "./$as_me.lineno" # Exit status is that of the last command. exit } ECHO_C= ECHO_N= ECHO_T= case `echo -n x` in #((((( -n*) case `echo 'xy\c'` in *c*) ECHO_T=' ';; # ECHO_T is single tab character. xy) ECHO_C='\c';; *) echo `echo ksh88 bug on AIX 6.1` > /dev/null ECHO_T=' ';; esac;; *) ECHO_N='-n';; esac rm -f conf$$ conf$$.exe conf$$.file if test -d conf$$.dir; then rm -f conf$$.dir/conf$$.file else rm -f conf$$.dir mkdir conf$$.dir 2>/dev/null fi if (echo >conf$$.file) 2>/dev/null; then if ln -s conf$$.file conf$$ 2>/dev/null; then as_ln_s='ln -s' # ... but there are two gotchas: # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail. # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable. # In both cases, we have to default to `cp -pR'. ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe || as_ln_s='cp -pR' elif ln conf$$.file conf$$ 2>/dev/null; then as_ln_s=ln else as_ln_s='cp -pR' fi else as_ln_s='cp -pR' fi rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file rmdir conf$$.dir 2>/dev/null if mkdir -p . 2>/dev/null; then as_mkdir_p='mkdir -p "$as_dir"' else test -d ./-p && rmdir ./-p as_mkdir_p=false fi as_test_x='test -x' as_executable_p=as_fn_executable_p # Sed expression to map a string onto a valid CPP name. 
as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'" # Sed expression to map a string onto a valid variable name. as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'" test -n "$DJDIR" || exec 7<&0 &1 # Name of the host. # hostname on some systems (SVR3.2, old GNU/Linux) returns a bogus exit status, # so uname gets run too. ac_hostname=`(hostname || uname -n) 2>/dev/null | sed 1q` # # Initializations. # ac_default_prefix=/usr/local ac_clean_files= ac_config_libobj_dir=. LIBOBJS= cross_compiling=no subdirs= MFLAGS= MAKEFLAGS= # Identity of this package. PACKAGE_NAME='RandomFieldsUtils' PACKAGE_TARNAME='randomfieldsutils' PACKAGE_VERSION='1.0' PACKAGE_STRING='RandomFieldsUtils 1.0' PACKAGE_BUGREPORT='' PACKAGE_URL='' ac_subst_vars='LTLIBOBJS LIBOBJS MY_C_FILES MY_CU_FILES MY_AVX512F MY_AVX2 MY_AVX MY_SSE41 MY_SSSE3 MY_SSE3 MY_SSE2 MY_CUDA_HOME MY_CUDA_LIBS MY_LIB_FLAGS MY_PKG_FLAGS SYSCTL LSCPU OBJEXT EXEEXT ac_ct_CXX CPPFLAGS LDFLAGS CXXFLAGS CXX target_alias host_alias build_alias LIBS ECHO_T ECHO_N ECHO_C DEFS mandir localedir libdir psdir pdfdir dvidir htmldir infodir docdir oldincludedir includedir runstatedir localstatedir sharedstatedir sysconfdir datadir datarootdir libexecdir sbindir bindir program_transform_name prefix exec_prefix PACKAGE_URL PACKAGE_BUGREPORT PACKAGE_STRING PACKAGE_VERSION PACKAGE_TARNAME PACKAGE_NAME PATH_SEPARATOR SHELL' ac_subst_files='' ac_user_opts=' enable_option_checking ' ac_precious_vars='build_alias host_alias target_alias CXX CXXFLAGS LDFLAGS LIBS CPPFLAGS CCC' # Initialize some variables set by options. ac_init_help= ac_init_version=false ac_unrecognized_opts= ac_unrecognized_sep= # The variables have the same names as the options, with # dashes changed to underlines. cache_file=/dev/null exec_prefix=NONE no_create= no_recursion= prefix=NONE program_prefix=NONE program_suffix=NONE program_transform_name=s,x,x, silent= site= srcdir= verbose= x_includes=NONE x_libraries=NONE # Installation directory options. # These are left unexpanded so users can "make install exec_prefix=/foo" # and all the variables that are supposed to be based on exec_prefix # by default will actually change. # Use braces instead of parens because sh, perl, etc. also accept them. # (The list follows the same order as the GNU Coding Standards.) bindir='${exec_prefix}/bin' sbindir='${exec_prefix}/sbin' libexecdir='${exec_prefix}/libexec' datarootdir='${prefix}/share' datadir='${datarootdir}' sysconfdir='${prefix}/etc' sharedstatedir='${prefix}/com' localstatedir='${prefix}/var' runstatedir='${localstatedir}/run' includedir='${prefix}/include' oldincludedir='/usr/include' docdir='${datarootdir}/doc/${PACKAGE_TARNAME}' infodir='${datarootdir}/info' htmldir='${docdir}' dvidir='${docdir}' pdfdir='${docdir}' psdir='${docdir}' libdir='${exec_prefix}/lib' localedir='${datarootdir}/locale' mandir='${datarootdir}/man' ac_prev= ac_dashdash= for ac_option do # If the previous option needs an argument, assign it. if test -n "$ac_prev"; then eval $ac_prev=\$ac_option ac_prev= continue fi case $ac_option in *=?*) ac_optarg=`expr "X$ac_option" : '[^=]*=\(.*\)'` ;; *=) ac_optarg= ;; *) ac_optarg=yes ;; esac # Accept the important Cygnus configure options, so we can diagnose typos. 
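# Sketch of how the option loop below maps command-line flags onto shell
# variables (the values are hypothetical examples, not defaults):
#   ./configure --prefix=/opt/R            ->  prefix=/opt/R
#   ./configure --enable-option-checking   ->  enable_option_checking=yes
#   ./configure --with-foo=bar             ->  with_foo=bar  (names not declared in
#                                               $ac_user_opts are also collected in
#                                               $ac_unrecognized_opts and reported later)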
case $ac_dashdash$ac_option in --) ac_dashdash=yes ;; -bindir | --bindir | --bindi | --bind | --bin | --bi) ac_prev=bindir ;; -bindir=* | --bindir=* | --bindi=* | --bind=* | --bin=* | --bi=*) bindir=$ac_optarg ;; -build | --build | --buil | --bui | --bu) ac_prev=build_alias ;; -build=* | --build=* | --buil=* | --bui=* | --bu=*) build_alias=$ac_optarg ;; -cache-file | --cache-file | --cache-fil | --cache-fi \ | --cache-f | --cache- | --cache | --cach | --cac | --ca | --c) ac_prev=cache_file ;; -cache-file=* | --cache-file=* | --cache-fil=* | --cache-fi=* \ | --cache-f=* | --cache-=* | --cache=* | --cach=* | --cac=* | --ca=* | --c=*) cache_file=$ac_optarg ;; --config-cache | -C) cache_file=config.cache ;; -datadir | --datadir | --datadi | --datad) ac_prev=datadir ;; -datadir=* | --datadir=* | --datadi=* | --datad=*) datadir=$ac_optarg ;; -datarootdir | --datarootdir | --datarootdi | --datarootd | --dataroot \ | --dataroo | --dataro | --datar) ac_prev=datarootdir ;; -datarootdir=* | --datarootdir=* | --datarootdi=* | --datarootd=* \ | --dataroot=* | --dataroo=* | --dataro=* | --datar=*) datarootdir=$ac_optarg ;; -disable-* | --disable-*) ac_useropt=`expr "x$ac_option" : 'x-*disable-\(.*\)'` # Reject names that are not valid shell variable names. expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && as_fn_error $? "invalid feature name: $ac_useropt" ac_useropt_orig=$ac_useropt ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` case $ac_user_opts in *" "enable_$ac_useropt" "*) ;; *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--disable-$ac_useropt_orig" ac_unrecognized_sep=', ';; esac eval enable_$ac_useropt=no ;; -docdir | --docdir | --docdi | --doc | --do) ac_prev=docdir ;; -docdir=* | --docdir=* | --docdi=* | --doc=* | --do=*) docdir=$ac_optarg ;; -dvidir | --dvidir | --dvidi | --dvid | --dvi | --dv) ac_prev=dvidir ;; -dvidir=* | --dvidir=* | --dvidi=* | --dvid=* | --dvi=* | --dv=*) dvidir=$ac_optarg ;; -enable-* | --enable-*) ac_useropt=`expr "x$ac_option" : 'x-*enable-\([^=]*\)'` # Reject names that are not valid shell variable names. expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && as_fn_error $? "invalid feature name: $ac_useropt" ac_useropt_orig=$ac_useropt ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` case $ac_user_opts in *" "enable_$ac_useropt" "*) ;; *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--enable-$ac_useropt_orig" ac_unrecognized_sep=', ';; esac eval enable_$ac_useropt=\$ac_optarg ;; -exec-prefix | --exec_prefix | --exec-prefix | --exec-prefi \ | --exec-pref | --exec-pre | --exec-pr | --exec-p | --exec- \ | --exec | --exe | --ex) ac_prev=exec_prefix ;; -exec-prefix=* | --exec_prefix=* | --exec-prefix=* | --exec-prefi=* \ | --exec-pref=* | --exec-pre=* | --exec-pr=* | --exec-p=* | --exec-=* \ | --exec=* | --exe=* | --ex=*) exec_prefix=$ac_optarg ;; -gas | --gas | --ga | --g) # Obsolete; use --with-gas. 
with_gas=yes ;; -help | --help | --hel | --he | -h) ac_init_help=long ;; -help=r* | --help=r* | --hel=r* | --he=r* | -hr*) ac_init_help=recursive ;; -help=s* | --help=s* | --hel=s* | --he=s* | -hs*) ac_init_help=short ;; -host | --host | --hos | --ho) ac_prev=host_alias ;; -host=* | --host=* | --hos=* | --ho=*) host_alias=$ac_optarg ;; -htmldir | --htmldir | --htmldi | --htmld | --html | --htm | --ht) ac_prev=htmldir ;; -htmldir=* | --htmldir=* | --htmldi=* | --htmld=* | --html=* | --htm=* \ | --ht=*) htmldir=$ac_optarg ;; -includedir | --includedir | --includedi | --included | --include \ | --includ | --inclu | --incl | --inc) ac_prev=includedir ;; -includedir=* | --includedir=* | --includedi=* | --included=* | --include=* \ | --includ=* | --inclu=* | --incl=* | --inc=*) includedir=$ac_optarg ;; -infodir | --infodir | --infodi | --infod | --info | --inf) ac_prev=infodir ;; -infodir=* | --infodir=* | --infodi=* | --infod=* | --info=* | --inf=*) infodir=$ac_optarg ;; -libdir | --libdir | --libdi | --libd) ac_prev=libdir ;; -libdir=* | --libdir=* | --libdi=* | --libd=*) libdir=$ac_optarg ;; -libexecdir | --libexecdir | --libexecdi | --libexecd | --libexec \ | --libexe | --libex | --libe) ac_prev=libexecdir ;; -libexecdir=* | --libexecdir=* | --libexecdi=* | --libexecd=* | --libexec=* \ | --libexe=* | --libex=* | --libe=*) libexecdir=$ac_optarg ;; -localedir | --localedir | --localedi | --localed | --locale) ac_prev=localedir ;; -localedir=* | --localedir=* | --localedi=* | --localed=* | --locale=*) localedir=$ac_optarg ;; -localstatedir | --localstatedir | --localstatedi | --localstated \ | --localstate | --localstat | --localsta | --localst | --locals) ac_prev=localstatedir ;; -localstatedir=* | --localstatedir=* | --localstatedi=* | --localstated=* \ | --localstate=* | --localstat=* | --localsta=* | --localst=* | --locals=*) localstatedir=$ac_optarg ;; -mandir | --mandir | --mandi | --mand | --man | --ma | --m) ac_prev=mandir ;; -mandir=* | --mandir=* | --mandi=* | --mand=* | --man=* | --ma=* | --m=*) mandir=$ac_optarg ;; -nfp | --nfp | --nf) # Obsolete; use --without-fp. 
with_fp=no ;; -no-create | --no-create | --no-creat | --no-crea | --no-cre \ | --no-cr | --no-c | -n) no_create=yes ;; -no-recursion | --no-recursion | --no-recursio | --no-recursi \ | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r) no_recursion=yes ;; -oldincludedir | --oldincludedir | --oldincludedi | --oldincluded \ | --oldinclude | --oldinclud | --oldinclu | --oldincl | --oldinc \ | --oldin | --oldi | --old | --ol | --o) ac_prev=oldincludedir ;; -oldincludedir=* | --oldincludedir=* | --oldincludedi=* | --oldincluded=* \ | --oldinclude=* | --oldinclud=* | --oldinclu=* | --oldincl=* | --oldinc=* \ | --oldin=* | --oldi=* | --old=* | --ol=* | --o=*) oldincludedir=$ac_optarg ;; -prefix | --prefix | --prefi | --pref | --pre | --pr | --p) ac_prev=prefix ;; -prefix=* | --prefix=* | --prefi=* | --pref=* | --pre=* | --pr=* | --p=*) prefix=$ac_optarg ;; -program-prefix | --program-prefix | --program-prefi | --program-pref \ | --program-pre | --program-pr | --program-p) ac_prev=program_prefix ;; -program-prefix=* | --program-prefix=* | --program-prefi=* \ | --program-pref=* | --program-pre=* | --program-pr=* | --program-p=*) program_prefix=$ac_optarg ;; -program-suffix | --program-suffix | --program-suffi | --program-suff \ | --program-suf | --program-su | --program-s) ac_prev=program_suffix ;; -program-suffix=* | --program-suffix=* | --program-suffi=* \ | --program-suff=* | --program-suf=* | --program-su=* | --program-s=*) program_suffix=$ac_optarg ;; -program-transform-name | --program-transform-name \ | --program-transform-nam | --program-transform-na \ | --program-transform-n | --program-transform- \ | --program-transform | --program-transfor \ | --program-transfo | --program-transf \ | --program-trans | --program-tran \ | --progr-tra | --program-tr | --program-t) ac_prev=program_transform_name ;; -program-transform-name=* | --program-transform-name=* \ | --program-transform-nam=* | --program-transform-na=* \ | --program-transform-n=* | --program-transform-=* \ | --program-transform=* | --program-transfor=* \ | --program-transfo=* | --program-transf=* \ | --program-trans=* | --program-tran=* \ | --progr-tra=* | --program-tr=* | --program-t=*) program_transform_name=$ac_optarg ;; -pdfdir | --pdfdir | --pdfdi | --pdfd | --pdf | --pd) ac_prev=pdfdir ;; -pdfdir=* | --pdfdir=* | --pdfdi=* | --pdfd=* | --pdf=* | --pd=*) pdfdir=$ac_optarg ;; -psdir | --psdir | --psdi | --psd | --ps) ac_prev=psdir ;; -psdir=* | --psdir=* | --psdi=* | --psd=* | --ps=*) psdir=$ac_optarg ;; -q | -quiet | --quiet | --quie | --qui | --qu | --q \ | -silent | --silent | --silen | --sile | --sil) silent=yes ;; -runstatedir | --runstatedir | --runstatedi | --runstated \ | --runstate | --runstat | --runsta | --runst | --runs \ | --run | --ru | --r) ac_prev=runstatedir ;; -runstatedir=* | --runstatedir=* | --runstatedi=* | --runstated=* \ | --runstate=* | --runstat=* | --runsta=* | --runst=* | --runs=* \ | --run=* | --ru=* | --r=*) runstatedir=$ac_optarg ;; -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb) ac_prev=sbindir ;; -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \ | --sbi=* | --sb=*) sbindir=$ac_optarg ;; -sharedstatedir | --sharedstatedir | --sharedstatedi \ | --sharedstated | --sharedstate | --sharedstat | --sharedsta \ | --sharedst | --shareds | --shared | --share | --shar \ | --sha | --sh) ac_prev=sharedstatedir ;; -sharedstatedir=* | --sharedstatedir=* | --sharedstatedi=* \ | --sharedstated=* | --sharedstate=* | --sharedstat=* | --sharedsta=* \ | --sharedst=* | 
--shareds=* | --shared=* | --share=* | --shar=* \ | --sha=* | --sh=*) sharedstatedir=$ac_optarg ;; -site | --site | --sit) ac_prev=site ;; -site=* | --site=* | --sit=*) site=$ac_optarg ;; -srcdir | --srcdir | --srcdi | --srcd | --src | --sr) ac_prev=srcdir ;; -srcdir=* | --srcdir=* | --srcdi=* | --srcd=* | --src=* | --sr=*) srcdir=$ac_optarg ;; -sysconfdir | --sysconfdir | --sysconfdi | --sysconfd | --sysconf \ | --syscon | --sysco | --sysc | --sys | --sy) ac_prev=sysconfdir ;; -sysconfdir=* | --sysconfdir=* | --sysconfdi=* | --sysconfd=* | --sysconf=* \ | --syscon=* | --sysco=* | --sysc=* | --sys=* | --sy=*) sysconfdir=$ac_optarg ;; -target | --target | --targe | --targ | --tar | --ta | --t) ac_prev=target_alias ;; -target=* | --target=* | --targe=* | --targ=* | --tar=* | --ta=* | --t=*) target_alias=$ac_optarg ;; -v | -verbose | --verbose | --verbos | --verbo | --verb) verbose=yes ;; -version | --version | --versio | --versi | --vers | -V) ac_init_version=: ;; -with-* | --with-*) ac_useropt=`expr "x$ac_option" : 'x-*with-\([^=]*\)'` # Reject names that are not valid shell variable names. expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && as_fn_error $? "invalid package name: $ac_useropt" ac_useropt_orig=$ac_useropt ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` case $ac_user_opts in *" "with_$ac_useropt" "*) ;; *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--with-$ac_useropt_orig" ac_unrecognized_sep=', ';; esac eval with_$ac_useropt=\$ac_optarg ;; -without-* | --without-*) ac_useropt=`expr "x$ac_option" : 'x-*without-\(.*\)'` # Reject names that are not valid shell variable names. expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && as_fn_error $? "invalid package name: $ac_useropt" ac_useropt_orig=$ac_useropt ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` case $ac_user_opts in *" "with_$ac_useropt" "*) ;; *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--without-$ac_useropt_orig" ac_unrecognized_sep=', ';; esac eval with_$ac_useropt=no ;; --x) # Obsolete; use --with-x. with_x=yes ;; -x-includes | --x-includes | --x-include | --x-includ | --x-inclu \ | --x-incl | --x-inc | --x-in | --x-i) ac_prev=x_includes ;; -x-includes=* | --x-includes=* | --x-include=* | --x-includ=* | --x-inclu=* \ | --x-incl=* | --x-inc=* | --x-in=* | --x-i=*) x_includes=$ac_optarg ;; -x-libraries | --x-libraries | --x-librarie | --x-librari \ | --x-librar | --x-libra | --x-libr | --x-lib | --x-li | --x-l) ac_prev=x_libraries ;; -x-libraries=* | --x-libraries=* | --x-librarie=* | --x-librari=* \ | --x-librar=* | --x-libra=* | --x-libr=* | --x-lib=* | --x-li=* | --x-l=*) x_libraries=$ac_optarg ;; -*) as_fn_error $? "unrecognized option: \`$ac_option' Try \`$0 --help' for more information" ;; *=*) ac_envvar=`expr "x$ac_option" : 'x\([^=]*\)='` # Reject names that are not valid shell variable names. case $ac_envvar in #( '' | [0-9]* | *[!_$as_cr_alnum]* ) as_fn_error $? "invalid variable name: \`$ac_envvar'" ;; esac eval $ac_envvar=\$ac_optarg export $ac_envvar ;; *) # FIXME: should be removed in autoconf 3.0. $as_echo "$as_me: WARNING: you should use --build, --host, --target" >&2 expr "x$ac_option" : ".*[^-._$as_cr_alnum]" >/dev/null && $as_echo "$as_me: WARNING: invalid host type: $ac_option" >&2 : "${build_alias=$ac_option} ${host_alias=$ac_option} ${target_alias=$ac_option}" ;; esac done if test -n "$ac_prev"; then ac_option=--`echo $ac_prev | sed 's/_/-/g'` as_fn_error $? 
"missing argument to $ac_option" fi if test -n "$ac_unrecognized_opts"; then case $enable_option_checking in no) ;; fatal) as_fn_error $? "unrecognized options: $ac_unrecognized_opts" ;; *) $as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2 ;; esac fi # Check all directory arguments for consistency. for ac_var in exec_prefix prefix bindir sbindir libexecdir datarootdir \ datadir sysconfdir sharedstatedir localstatedir includedir \ oldincludedir docdir infodir htmldir dvidir pdfdir psdir \ libdir localedir mandir runstatedir do eval ac_val=\$$ac_var # Remove trailing slashes. case $ac_val in */ ) ac_val=`expr "X$ac_val" : 'X\(.*[^/]\)' \| "X$ac_val" : 'X\(.*\)'` eval $ac_var=\$ac_val;; esac # Be sure to have absolute directory names. case $ac_val in [\\/$]* | ?:[\\/]* ) continue;; NONE | '' ) case $ac_var in *prefix ) continue;; esac;; esac as_fn_error $? "expected an absolute directory name for --$ac_var: $ac_val" done # There might be people who depend on the old broken behavior: `$host' # used to hold the argument of --host etc. # FIXME: To remove some day. build=$build_alias host=$host_alias target=$target_alias # FIXME: To remove some day. if test "x$host_alias" != x; then if test "x$build_alias" = x; then cross_compiling=maybe elif test "x$build_alias" != "x$host_alias"; then cross_compiling=yes fi fi ac_tool_prefix= test -n "$host_alias" && ac_tool_prefix=$host_alias- test "$silent" = yes && exec 6>/dev/null ac_pwd=`pwd` && test -n "$ac_pwd" && ac_ls_di=`ls -di .` && ac_pwd_ls_di=`cd "$ac_pwd" && ls -di .` || as_fn_error $? "working directory cannot be determined" test "X$ac_ls_di" = "X$ac_pwd_ls_di" || as_fn_error $? "pwd does not report name of working directory" # Find the source files, if location was not specified. if test -z "$srcdir"; then ac_srcdir_defaulted=yes # Try the directory containing this script, then the parent directory. ac_confdir=`$as_dirname -- "$as_myself" || $as_expr X"$as_myself" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ X"$as_myself" : 'X\(//\)[^/]' \| \ X"$as_myself" : 'X\(//\)$' \| \ X"$as_myself" : 'X\(/\)' \| . 2>/dev/null || $as_echo X"$as_myself" | sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ s//\1/ q } /^X\(\/\/\)[^/].*/{ s//\1/ q } /^X\(\/\/\)$/{ s//\1/ q } /^X\(\/\).*/{ s//\1/ q } s/.*/./; q'` srcdir=$ac_confdir if test ! -r "$srcdir/$ac_unique_file"; then srcdir=.. fi else ac_srcdir_defaulted=no fi if test ! -r "$srcdir/$ac_unique_file"; then test "$ac_srcdir_defaulted" = yes && srcdir="$ac_confdir or .." as_fn_error $? "cannot find sources ($ac_unique_file) in $srcdir" fi ac_msg="sources are in $srcdir, but \`cd $srcdir' does not work" ac_abs_confdir=`( cd "$srcdir" && test -r "./$ac_unique_file" || as_fn_error $? "$ac_msg" pwd)` # When building in place, set srcdir=. if test "$ac_abs_confdir" = "$ac_pwd"; then srcdir=. fi # Remove unnecessary trailing slashes from srcdir. # Double slashes in file names in object file debugging info # mess up M-x gdb in Emacs. case $srcdir in */) srcdir=`expr "X$srcdir" : 'X\(.*[^/]\)' \| "X$srcdir" : 'X\(.*\)'`;; esac for ac_var in $ac_precious_vars; do eval ac_env_${ac_var}_set=\${${ac_var}+set} eval ac_env_${ac_var}_value=\$${ac_var} eval ac_cv_env_${ac_var}_set=\${${ac_var}+set} eval ac_cv_env_${ac_var}_value=\$${ac_var} done # # Report the --help message. # if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. 
cat <<_ACEOF \`configure' configures RandomFieldsUtils 1.0 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... To assign environment variables (e.g., CC, CFLAGS...), specify them as VAR=VALUE. See below for descriptions of some of the useful variables. Defaults for the options are specified in brackets. Configuration: -h, --help display this help and exit --help=short display options specific to this package --help=recursive display the short help of all the included packages -V, --version display version information and exit -q, --quiet, --silent do not print \`checking ...' messages --cache-file=FILE cache test results in FILE [disabled] -C, --config-cache alias for \`--cache-file=config.cache' -n, --no-create do not create output files --srcdir=DIR find the sources in DIR [configure dir or \`..'] Installation directories: --prefix=PREFIX install architecture-independent files in PREFIX [$ac_default_prefix] --exec-prefix=EPREFIX install architecture-dependent files in EPREFIX [PREFIX] By default, \`make install' will install all the files in \`$ac_default_prefix/bin', \`$ac_default_prefix/lib' etc. You can specify an installation prefix other than \`$ac_default_prefix' using \`--prefix', for instance \`--prefix=\$HOME'. For better control, use the options below. Fine tuning of the installation directories: --bindir=DIR user executables [EPREFIX/bin] --sbindir=DIR system admin executables [EPREFIX/sbin] --libexecdir=DIR program executables [EPREFIX/libexec] --sysconfdir=DIR read-only single-machine data [PREFIX/etc] --sharedstatedir=DIR modifiable architecture-independent data [PREFIX/com] --localstatedir=DIR modifiable single-machine data [PREFIX/var] --runstatedir=DIR modifiable per-process data [LOCALSTATEDIR/run] --libdir=DIR object code libraries [EPREFIX/lib] --includedir=DIR C header files [PREFIX/include] --oldincludedir=DIR C header files for non-gcc [/usr/include] --datarootdir=DIR read-only arch.-independent data root [PREFIX/share] --datadir=DIR read-only architecture-independent data [DATAROOTDIR] --infodir=DIR info documentation [DATAROOTDIR/info] --localedir=DIR locale-dependent data [DATAROOTDIR/locale] --mandir=DIR man documentation [DATAROOTDIR/man] --docdir=DIR documentation root [DATAROOTDIR/doc/randomfieldsutils] --htmldir=DIR html documentation [DOCDIR] --dvidir=DIR dvi documentation [DOCDIR] --pdfdir=DIR pdf documentation [DOCDIR] --psdir=DIR ps documentation [DOCDIR] _ACEOF cat <<\_ACEOF _ACEOF fi if test -n "$ac_init_help"; then case $ac_init_help in short | recursive ) echo "Configuration of RandomFieldsUtils 1.0:";; esac cat <<\_ACEOF Some influential environment variables: CXX C++ compiler command CXXFLAGS C++ compiler flags LDFLAGS linker flags, e.g. -L if you have libraries in a nonstandard directory LIBS libraries to pass to the linker, e.g. -l CPPFLAGS (Objective) C/C++ preprocessor flags, e.g. -I if you have headers in a nonstandard directory Use these variables to override the choices made by `configure' or to help it to find libraries and programs with nonstandard names/locations. Report bugs to the package provider. _ACEOF ac_status=$? fi if test "$ac_init_help" = "recursive"; then # If there are subdirs, report their specific --help. for ac_dir in : $ac_subdirs_all; do test "x$ac_dir" = x: && continue test -d "$ac_dir" || { cd "$srcdir" && ac_pwd=`pwd` && srcdir=. && test -d "$ac_dir"; } || continue ac_builddir=. case "$ac_dir" in .) ac_dir_suffix= ac_top_builddir_sub=. 
ac_top_build_prefix= ;; *) ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'` # A ".." for each directory in $ac_dir_suffix. ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'` case $ac_top_builddir_sub in "") ac_top_builddir_sub=. ac_top_build_prefix= ;; *) ac_top_build_prefix=$ac_top_builddir_sub/ ;; esac ;; esac ac_abs_top_builddir=$ac_pwd ac_abs_builddir=$ac_pwd$ac_dir_suffix # for backward compatibility: ac_top_builddir=$ac_top_build_prefix case $srcdir in .) # We are building in place. ac_srcdir=. ac_top_srcdir=$ac_top_builddir_sub ac_abs_top_srcdir=$ac_pwd ;; [\\/]* | ?:[\\/]* ) # Absolute name. ac_srcdir=$srcdir$ac_dir_suffix; ac_top_srcdir=$srcdir ac_abs_top_srcdir=$srcdir ;; *) # Relative name. ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix ac_top_srcdir=$ac_top_build_prefix$srcdir ac_abs_top_srcdir=$ac_pwd/$srcdir ;; esac ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix cd "$ac_dir" || { ac_status=$?; continue; } # Check for guested configure. if test -f "$ac_srcdir/configure.gnu"; then echo && $SHELL "$ac_srcdir/configure.gnu" --help=recursive elif test -f "$ac_srcdir/configure"; then echo && $SHELL "$ac_srcdir/configure" --help=recursive else $as_echo "$as_me: WARNING: no configuration information is in $ac_dir" >&2 fi || ac_status=$? cd "$ac_pwd" || { ac_status=$?; break; } done fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF RandomFieldsUtils configure 1.0 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. This configure script is free software; the Free Software Foundation gives unlimited permission to copy, distribute and modify it. _ACEOF exit fi ## ------------------------ ## ## Autoconf initialization. ## ## ------------------------ ## # ac_fn_cxx_try_compile LINENO # ---------------------------- # Try to compile conftest.$ac_ext, and return whether this succeeded. ac_fn_cxx_try_compile () { as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack rm -f conftest.$ac_objext if { { ac_try="$ac_compile" case "(($ac_try" in *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; *) ac_try_echo=$ac_try;; esac eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" $as_echo "$ac_try_echo"; } >&5 (eval "$ac_compile") 2>conftest.err ac_status=$? if test -s conftest.err; then grep -v '^ *+' conftest.err >conftest.er1 cat conftest.er1 >&5 mv -f conftest.er1 conftest.err fi $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 test $ac_status = 0; } && { test -z "$ac_cxx_werror_flag" || test ! -s conftest.err } && test -s conftest.$ac_objext; then : ac_retval=0 else $as_echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 ac_retval=1 fi eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno as_fn_set_status $ac_retval } # ac_fn_cxx_try_compile cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. It was created by RandomFieldsUtils $as_me 1.0, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ _ACEOF exec 5>>config.log { cat <<_ASUNAME ## --------- ## ## Platform. 
## ## --------- ## hostname = `(hostname || uname -n) 2>/dev/null | sed 1q` uname -m = `(uname -m) 2>/dev/null || echo unknown` uname -r = `(uname -r) 2>/dev/null || echo unknown` uname -s = `(uname -s) 2>/dev/null || echo unknown` uname -v = `(uname -v) 2>/dev/null || echo unknown` /usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null || echo unknown` /bin/uname -X = `(/bin/uname -X) 2>/dev/null || echo unknown` /bin/arch = `(/bin/arch) 2>/dev/null || echo unknown` /usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null || echo unknown` /usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null || echo unknown` /usr/bin/hostinfo = `(/usr/bin/hostinfo) 2>/dev/null || echo unknown` /bin/machine = `(/bin/machine) 2>/dev/null || echo unknown` /usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null || echo unknown` /bin/universe = `(/bin/universe) 2>/dev/null || echo unknown` _ASUNAME as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. $as_echo "PATH: $as_dir" done IFS=$as_save_IFS } >&5 cat >&5 <<_ACEOF ## ----------- ## ## Core tests. ## ## ----------- ## _ACEOF # Keep a trace of the command line. # Strip out --no-create and --no-recursion so they do not pile up. # Strip out --silent because we don't want to record it for future runs. # Also quote any args containing shell meta-characters. # Make two passes to allow for proper duplicate-argument suppression. ac_configure_args= ac_configure_args0= ac_configure_args1= ac_must_keep_next=false for ac_pass in 1 2 do for ac_arg do case $ac_arg in -no-create | --no-c* | -n | -no-recursion | --no-r*) continue ;; -q | -quiet | --quiet | --quie | --qui | --qu | --q \ | -silent | --silent | --silen | --sile | --sil) continue ;; *\'*) ac_arg=`$as_echo "$ac_arg" | sed "s/'/'\\\\\\\\''/g"` ;; esac case $ac_pass in 1) as_fn_append ac_configure_args0 " '$ac_arg'" ;; 2) as_fn_append ac_configure_args1 " '$ac_arg'" if test $ac_must_keep_next = true; then ac_must_keep_next=false # Got value, back to normal. else case $ac_arg in *=* | --config-cache | -C | -disable-* | --disable-* \ | -enable-* | --enable-* | -gas | --g* | -nfp | --nf* \ | -q | -quiet | --q* | -silent | --sil* | -v | -verb* \ | -with-* | --with-* | -without-* | --without-* | --x) case "$ac_configure_args0 " in "$ac_configure_args1"*" '$ac_arg' "* ) continue ;; esac ;; -* ) ac_must_keep_next=true ;; esac fi as_fn_append ac_configure_args " '$ac_arg'" ;; esac done done { ac_configure_args0=; unset ac_configure_args0;} { ac_configure_args1=; unset ac_configure_args1;} # When interrupted or exit'd, cleanup temporary files, and complete # config.log. We remove comments because anyway the quotes in there # would cause problems or look ugly. # WARNING: Use '\'' to represent an apostrophe within the trap. # WARNING: Do not start the trap code with a newline, due to a FreeBSD 4.0 bug. trap 'exit_status=$? # Save into config.log some information that might help in debugging. { echo $as_echo "## ---------------- ## ## Cache variables. 
## ## ---------------- ##" echo # The following way of writing the cache mishandles newlines in values, ( for ac_var in `(set) 2>&1 | sed -n '\''s/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'\''`; do eval ac_val=\$$ac_var case $ac_val in #( *${as_nl}*) case $ac_var in #( *_cv_*) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5 $as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; esac case $ac_var in #( _ | IFS | as_nl) ;; #( BASH_ARGV | BASH_SOURCE) eval $ac_var= ;; #( *) { eval $ac_var=; unset $ac_var;} ;; esac ;; esac done (set) 2>&1 | case $as_nl`(ac_space='\'' '\''; set) 2>&1` in #( *${as_nl}ac_space=\ *) sed -n \ "s/'\''/'\''\\\\'\'''\''/g; s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\''\\2'\''/p" ;; #( *) sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p" ;; esac | sort ) echo $as_echo "## ----------------- ## ## Output variables. ## ## ----------------- ##" echo for ac_var in $ac_subst_vars do eval ac_val=\$$ac_var case $ac_val in *\'\''*) ac_val=`$as_echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;; esac $as_echo "$ac_var='\''$ac_val'\''" done | sort echo if test -n "$ac_subst_files"; then $as_echo "## ------------------- ## ## File substitutions. ## ## ------------------- ##" echo for ac_var in $ac_subst_files do eval ac_val=\$$ac_var case $ac_val in *\'\''*) ac_val=`$as_echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;; esac $as_echo "$ac_var='\''$ac_val'\''" done | sort echo fi if test -s confdefs.h; then $as_echo "## ----------- ## ## confdefs.h. ## ## ----------- ##" echo cat confdefs.h echo fi test "$ac_signal" != 0 && $as_echo "$as_me: caught signal $ac_signal" $as_echo "$as_me: exit $exit_status" } >&5 rm -f core *.core core.conftest.* && rm -f -r conftest* confdefs* conf$$* $ac_clean_files && exit $exit_status ' 0 for ac_signal in 1 2 13 15; do trap 'ac_signal='$ac_signal'; as_fn_exit 1' $ac_signal done ac_signal=0 # confdefs.h avoids OS command line length limits that DEFS can exceed. rm -f -r conftest* confdefs.h $as_echo "/* confdefs.h */" > confdefs.h # Predefined preprocessor variables. cat >>confdefs.h <<_ACEOF #define PACKAGE_NAME "$PACKAGE_NAME" _ACEOF cat >>confdefs.h <<_ACEOF #define PACKAGE_TARNAME "$PACKAGE_TARNAME" _ACEOF cat >>confdefs.h <<_ACEOF #define PACKAGE_VERSION "$PACKAGE_VERSION" _ACEOF cat >>confdefs.h <<_ACEOF #define PACKAGE_STRING "$PACKAGE_STRING" _ACEOF cat >>confdefs.h <<_ACEOF #define PACKAGE_BUGREPORT "$PACKAGE_BUGREPORT" _ACEOF cat >>confdefs.h <<_ACEOF #define PACKAGE_URL "$PACKAGE_URL" _ACEOF # Let the site file select an alternate cache file if it wants to. # Prefer an explicitly selected file to automatically selected ones. ac_site_file1=NONE ac_site_file2=NONE if test -n "$CONFIG_SITE"; then # We do not want a PATH search for config.site. case $CONFIG_SITE in #(( -*) ac_site_file1=./$CONFIG_SITE;; */*) ac_site_file1=$CONFIG_SITE;; *) ac_site_file1=./$CONFIG_SITE;; esac elif test "x$prefix" != xNONE; then ac_site_file1=$prefix/share/config.site ac_site_file2=$prefix/etc/config.site else ac_site_file1=$ac_default_prefix/share/config.site ac_site_file2=$ac_default_prefix/etc/config.site fi for ac_site_file in "$ac_site_file1" "$ac_site_file2" do test "x$ac_site_file" = xNONE && continue if test /dev/null != "$ac_site_file" && test -r "$ac_site_file"; then { $as_echo "$as_me:${as_lineno-$LINENO}: loading site script $ac_site_file" >&5 $as_echo "$as_me: loading site script $ac_site_file" >&6;} sed 's/^/| /' "$ac_site_file" >&5 . 
"$ac_site_file" \ || { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 $as_echo "$as_me: error: in \`$ac_pwd':" >&2;} as_fn_error $? "failed to load site script $ac_site_file See \`config.log' for more details" "$LINENO" 5; } fi done if test -r "$cache_file"; then # Some versions of bash will fail to source /dev/null (special files # actually), so we avoid doing that. DJGPP emulates it as a regular file. if test /dev/null != "$cache_file" && test -f "$cache_file"; then { $as_echo "$as_me:${as_lineno-$LINENO}: loading cache $cache_file" >&5 $as_echo "$as_me: loading cache $cache_file" >&6;} case $cache_file in [\\/]* | ?:[\\/]* ) . "$cache_file";; *) . "./$cache_file";; esac fi else { $as_echo "$as_me:${as_lineno-$LINENO}: creating cache $cache_file" >&5 $as_echo "$as_me: creating cache $cache_file" >&6;} >$cache_file fi # Check that the precious variables saved in the cache have kept the same # value. ac_cache_corrupted=false for ac_var in $ac_precious_vars; do eval ac_old_set=\$ac_cv_env_${ac_var}_set eval ac_new_set=\$ac_env_${ac_var}_set eval ac_old_val=\$ac_cv_env_${ac_var}_value eval ac_new_val=\$ac_env_${ac_var}_value case $ac_old_set,$ac_new_set in set,) { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&5 $as_echo "$as_me: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&2;} ac_cache_corrupted=: ;; ,set) { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' was not set in the previous run" >&5 $as_echo "$as_me: error: \`$ac_var' was not set in the previous run" >&2;} ac_cache_corrupted=: ;; ,);; *) if test "x$ac_old_val" != "x$ac_new_val"; then # differences in whitespace do not lead to failure. ac_old_val_w=`echo x $ac_old_val` ac_new_val_w=`echo x $ac_new_val` if test "$ac_old_val_w" != "$ac_new_val_w"; then { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' has changed since the previous run:" >&5 $as_echo "$as_me: error: \`$ac_var' has changed since the previous run:" >&2;} ac_cache_corrupted=: else { $as_echo "$as_me:${as_lineno-$LINENO}: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&5 $as_echo "$as_me: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&2;} eval $ac_var=\$ac_old_val fi { $as_echo "$as_me:${as_lineno-$LINENO}: former value: \`$ac_old_val'" >&5 $as_echo "$as_me: former value: \`$ac_old_val'" >&2;} { $as_echo "$as_me:${as_lineno-$LINENO}: current value: \`$ac_new_val'" >&5 $as_echo "$as_me: current value: \`$ac_new_val'" >&2;} fi;; esac # Pass precious variables to config.status. if test "$ac_new_set" = set; then case $ac_new_val in *\'*) ac_arg=$ac_var=`$as_echo "$ac_new_val" | sed "s/'/'\\\\\\\\''/g"` ;; *) ac_arg=$ac_var=$ac_new_val ;; esac case " $ac_configure_args " in *" '$ac_arg' "*) ;; # Avoid dups. Use of quotes ensures accuracy. *) as_fn_append ac_configure_args " '$ac_arg'" ;; esac fi done if $ac_cache_corrupted; then { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 $as_echo "$as_me: error: in \`$ac_pwd':" >&2;} { $as_echo "$as_me:${as_lineno-$LINENO}: error: changes in the environment can compromise the build" >&5 $as_echo "$as_me: error: changes in the environment can compromise the build" >&2;} as_fn_error $? "run \`make distclean' and/or \`rm $cache_file' and start over" "$LINENO" 5 fi ## -------------------- ## ## Main body of script. 
## ## -------------------- ##
ac_ext=c
ac_cpp='$CPP $CPPFLAGS'
ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
ac_compiler_gnu=$ac_cv_c_compiler_gnu

##################################################
## debugging options
##################################################
#CROSS="arm32"
#CROSS="avx2"
#CROSS="nosimd"
#CROSS="noflags"
#CROSS="FALSE"
#CROSS="TRUE"
#CROSS="NA"
#USE_GPU="try"
#USE_GPU="yes"
#CXX_FLAGS="-march=native"
#CXX_FLAGS="-nonsense"
#USERASKED="TRUE"
#USERASKED="FALSE"
#MEM_IS_ALIGNED="TRUE"
#MEM_IS_ALIGNED="FALSE"

##################################################
## explicit options passed by RandomFieldsUtils
##################################################
## CXX_FLAGS (including omp)
## CROSS
## SIMD_FLAGS # superset of what is needed and recognized
## USE_GPU
## CUDA_HOME
## USERASKED
## MEM_IS_ALIGNED

##################################################
## package specific definitions
##################################################
#MY_SSE2
#MY_SSE3
#MY_SSSE3
#MY_SSE41
#MY_AVX
#MY_AVX2
#MY_AVX512F
#MY_MAX_SSE2
#MY_MAX_SSE3
#MY_MAX_SSSE3
#MY_MAX_SSE41
#MY_MAX_AVX
#MY_MAX_AVX2
#MY_MAX_AVX512F
#MY_CU_FILES
#MY_C_FILES

##################################################
## RandomFieldsUtils
##################################################
##CROSS=
MY_SSE2=""
MY_SSE3=""
MY_SSSE3=""
MY_SSE41=""
MY_AVX="avx_fctns.o"
MY_AVX2="avx2_fctns.o"
MY_AVX512F=""
MY_CU_FILES="solve_61.o gpu_info_61.o"
MY_C_FILES="AutoRandomFieldsUtils.o beskf.o brdomain.o extern.o kleinkram.o maths.o options.o RFoptions.o solve.o sort.o sortLong.o utils.o win_linux_aux.o xport_import.o zzz.o gpu_info.o bckslvmodified.o cholmodified.o spamown.o obsolete.o"

##################################################
## general part
##################################################
CXX=`"${R_HOME}/bin/R" CMD config CXX`
ac_ext=cpp
ac_cpp='$CXXCPP $CPPFLAGS'
ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
if test -z "$CXX"; then
  if test -n "$CCC"; then
    CXX=$CCC
  else
    if test -n "$ac_tool_prefix"; then
  for ac_prog in g++ c++ gpp aCC CC cxx cc++ cl.exe FCC KCC RCC xlC_r xlC
  do
    # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args.
set dummy $ac_tool_prefix$ac_prog; ac_word=$2
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
$as_echo_n "checking for $ac_word... " >&6; }
if ${ac_cv_prog_CXX+:} false; then :
  $as_echo_n "(cached) " >&6
else
  if test -n "$CXX"; then
  ac_cv_prog_CXX="$CXX" # Let the user override the test.
else
as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
for as_dir in $PATH
do
  IFS=$as_save_IFS
  test -z "$as_dir" && as_dir=.
for ac_exec_ext in '' $ac_executable_extensions; do if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then ac_cv_prog_CXX="$ac_tool_prefix$ac_prog" $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi done done IFS=$as_save_IFS fi fi CXX=$ac_cv_prog_CXX if test -n "$CXX"; then { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CXX" >&5 $as_echo "$CXX" >&6; } else { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 $as_echo "no" >&6; } fi test -n "$CXX" && break done fi if test -z "$CXX"; then ac_ct_CXX=$CXX for ac_prog in g++ c++ gpp aCC CC cxx cc++ cl.exe FCC KCC RCC xlC_r xlC do # Extract the first word of "$ac_prog", so it can be a program name with args. set dummy $ac_prog; ac_word=$2 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 $as_echo_n "checking for $ac_word... " >&6; } if ${ac_cv_prog_ac_ct_CXX+:} false; then : $as_echo_n "(cached) " >&6 else if test -n "$ac_ct_CXX"; then ac_cv_prog_ac_ct_CXX="$ac_ct_CXX" # Let the user override the test. else as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. for ac_exec_ext in '' $ac_executable_extensions; do if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then ac_cv_prog_ac_ct_CXX="$ac_prog" $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi done done IFS=$as_save_IFS fi fi ac_ct_CXX=$ac_cv_prog_ac_ct_CXX if test -n "$ac_ct_CXX"; then { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CXX" >&5 $as_echo "$ac_ct_CXX" >&6; } else { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 $as_echo "no" >&6; } fi test -n "$ac_ct_CXX" && break done if test "x$ac_ct_CXX" = x; then CXX="g++" else case $cross_compiling:$ac_tool_warned in yes:) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 $as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} ac_tool_warned=yes ;; esac CXX=$ac_ct_CXX fi fi fi fi # Provide some information about the compiler. $as_echo "$as_me:${as_lineno-$LINENO}: checking for C++ compiler version" >&5 set X $ac_compile ac_compiler=$2 for ac_option in --version -v -V -qversion; do { { ac_try="$ac_compiler $ac_option >&5" case "(($ac_try" in *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; *) ac_try_echo=$ac_try;; esac eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" $as_echo "$ac_try_echo"; } >&5 (eval "$ac_compiler $ac_option >&5") 2>conftest.err ac_status=$? if test -s conftest.err; then sed '10a\ ... rest of stderr output deleted ... 10q' conftest.err >conftest.er1 cat conftest.er1 >&5 fi rm -f conftest.er1 conftest.err $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 test $ac_status = 0; } done cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ int main () { ; return 0; } _ACEOF ac_clean_files_save=$ac_clean_files ac_clean_files="$ac_clean_files a.out a.out.dSYM a.exe b.out" # Try to create an executable without -o first, disregard a.out. # It will help us diagnose broken compilers, and finding out an intuition # of exeext. { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the C++ compiler works" >&5 $as_echo_n "checking whether the C++ compiler works... 
" >&6; } ac_link_default=`$as_echo "$ac_link" | sed 's/ -o *conftest[^ ]*//'` # The possible output files: ac_files="a.out conftest.exe conftest a.exe a_out.exe b.out conftest.*" ac_rmfiles= for ac_file in $ac_files do case $ac_file in *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) ;; * ) ac_rmfiles="$ac_rmfiles $ac_file";; esac done rm -f $ac_rmfiles if { { ac_try="$ac_link_default" case "(($ac_try" in *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; *) ac_try_echo=$ac_try;; esac eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" $as_echo "$ac_try_echo"; } >&5 (eval "$ac_link_default") 2>&5 ac_status=$? $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 test $ac_status = 0; }; then : # Autoconf-2.13 could set the ac_cv_exeext variable to `no'. # So ignore a value of `no', otherwise this would lead to `EXEEXT = no' # in a Makefile. We should not override ac_cv_exeext if it was cached, # so that the user can short-circuit this test for compilers unknown to # Autoconf. for ac_file in $ac_files '' do test -f "$ac_file" || continue case $ac_file in *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) ;; [ab].out ) # We found the default executable, but exeext='' is most # certainly right. break;; *.* ) if test "${ac_cv_exeext+set}" = set && test "$ac_cv_exeext" != no; then :; else ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'` fi # We set ac_cv_exeext here because the later test for it is not # safe: cross compilers may not add the suffix if given an `-o' # argument, so we may need to know it at that point already. # Even if this section looks crufty: it has the advantage of # actually working. break;; * ) break;; esac done test "$ac_cv_exeext" = no && ac_cv_exeext= else ac_file='' fi if test -z "$ac_file"; then : { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 $as_echo "no" >&6; } $as_echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 $as_echo "$as_me: error: in \`$ac_pwd':" >&2;} as_fn_error 77 "C++ compiler cannot create executables See \`config.log' for more details" "$LINENO" 5; } else { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 $as_echo "yes" >&6; } fi { $as_echo "$as_me:${as_lineno-$LINENO}: checking for C++ compiler default output file name" >&5 $as_echo_n "checking for C++ compiler default output file name... " >&6; } { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_file" >&5 $as_echo "$ac_file" >&6; } ac_exeext=$ac_cv_exeext rm -f -r a.out a.out.dSYM a.exe conftest$ac_cv_exeext b.out ac_clean_files=$ac_clean_files_save { $as_echo "$as_me:${as_lineno-$LINENO}: checking for suffix of executables" >&5 $as_echo_n "checking for suffix of executables... " >&6; } if { { ac_try="$ac_link" case "(($ac_try" in *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; *) ac_try_echo=$ac_try;; esac eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" $as_echo "$ac_try_echo"; } >&5 (eval "$ac_link") 2>&5 ac_status=$? $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 test $ac_status = 0; }; then : # If both `conftest.exe' and `conftest' are `present' (well, observable) # catch `conftest.exe'. For instance with Cygwin, `ls conftest' will # work properly (i.e., refer to `conftest.exe'), while it won't with # `rm'. 
for ac_file in conftest.exe conftest conftest.*; do test -f "$ac_file" || continue case $ac_file in *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) ;; *.* ) ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'` break;; * ) break;; esac done else { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 $as_echo "$as_me: error: in \`$ac_pwd':" >&2;} as_fn_error $? "cannot compute suffix of executables: cannot compile and link See \`config.log' for more details" "$LINENO" 5; } fi rm -f conftest conftest$ac_cv_exeext { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_exeext" >&5 $as_echo "$ac_cv_exeext" >&6; } rm -f conftest.$ac_ext EXEEXT=$ac_cv_exeext ac_exeext=$EXEEXT cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ #include int main () { FILE *f = fopen ("conftest.out", "w"); return ferror (f) || fclose (f) != 0; ; return 0; } _ACEOF ac_clean_files="$ac_clean_files conftest.out" # Check that the compiler produces executables we can run. If not, either # the compiler is broken, or we cross compile. { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are cross compiling" >&5 $as_echo_n "checking whether we are cross compiling... " >&6; } if test "$cross_compiling" != yes; then { { ac_try="$ac_link" case "(($ac_try" in *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; *) ac_try_echo=$ac_try;; esac eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" $as_echo "$ac_try_echo"; } >&5 (eval "$ac_link") 2>&5 ac_status=$? $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 test $ac_status = 0; } if { ac_try='./conftest$ac_cv_exeext' { { case "(($ac_try" in *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; *) ac_try_echo=$ac_try;; esac eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" $as_echo "$ac_try_echo"; } >&5 (eval "$ac_try") 2>&5 ac_status=$? $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 test $ac_status = 0; }; }; then cross_compiling=no else if test "$cross_compiling" = maybe; then cross_compiling=yes else { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 $as_echo "$as_me: error: in \`$ac_pwd':" >&2;} as_fn_error $? "cannot run C++ compiled programs. If you meant to cross compile, use \`--host'. See \`config.log' for more details" "$LINENO" 5; } fi fi fi { $as_echo "$as_me:${as_lineno-$LINENO}: result: $cross_compiling" >&5 $as_echo "$cross_compiling" >&6; } rm -f conftest.$ac_ext conftest$ac_cv_exeext conftest.out ac_clean_files=$ac_clean_files_save { $as_echo "$as_me:${as_lineno-$LINENO}: checking for suffix of object files" >&5 $as_echo_n "checking for suffix of object files... " >&6; } if ${ac_cv_objext+:} false; then : $as_echo_n "(cached) " >&6 else cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ int main () { ; return 0; } _ACEOF rm -f conftest.o conftest.obj if { { ac_try="$ac_compile" case "(($ac_try" in *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; *) ac_try_echo=$ac_try;; esac eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" $as_echo "$ac_try_echo"; } >&5 (eval "$ac_compile") 2>&5 ac_status=$? $as_echo "$as_me:${as_lineno-$LINENO}: \$? 
= $ac_status" >&5 test $ac_status = 0; }; then : for ac_file in conftest.o conftest.obj conftest.*; do test -f "$ac_file" || continue; case $ac_file in *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM ) ;; *) ac_cv_objext=`expr "$ac_file" : '.*\.\(.*\)'` break;; esac done else $as_echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 $as_echo "$as_me: error: in \`$ac_pwd':" >&2;} as_fn_error $? "cannot compute suffix of object files: cannot compile See \`config.log' for more details" "$LINENO" 5; } fi rm -f conftest.$ac_cv_objext conftest.$ac_ext fi { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_objext" >&5 $as_echo "$ac_cv_objext" >&6; } OBJEXT=$ac_cv_objext ac_objext=$OBJEXT { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using the GNU C++ compiler" >&5 $as_echo_n "checking whether we are using the GNU C++ compiler... " >&6; } if ${ac_cv_cxx_compiler_gnu+:} false; then : $as_echo_n "(cached) " >&6 else cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ int main () { #ifndef __GNUC__ choke me #endif ; return 0; } _ACEOF if ac_fn_cxx_try_compile "$LINENO"; then : ac_compiler_gnu=yes else ac_compiler_gnu=no fi rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext ac_cv_cxx_compiler_gnu=$ac_compiler_gnu fi { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_cxx_compiler_gnu" >&5 $as_echo "$ac_cv_cxx_compiler_gnu" >&6; } if test $ac_compiler_gnu = yes; then GXX=yes else GXX= fi ac_test_CXXFLAGS=${CXXFLAGS+set} ac_save_CXXFLAGS=$CXXFLAGS { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CXX accepts -g" >&5 $as_echo_n "checking whether $CXX accepts -g... " >&6; } if ${ac_cv_prog_cxx_g+:} false; then : $as_echo_n "(cached) " >&6 else ac_save_cxx_werror_flag=$ac_cxx_werror_flag ac_cxx_werror_flag=yes ac_cv_prog_cxx_g=no CXXFLAGS="-g" cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ int main () { ; return 0; } _ACEOF if ac_fn_cxx_try_compile "$LINENO"; then : ac_cv_prog_cxx_g=yes else CXXFLAGS="" cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ int main () { ; return 0; } _ACEOF if ac_fn_cxx_try_compile "$LINENO"; then : else ac_cxx_werror_flag=$ac_save_cxx_werror_flag CXXFLAGS="-g" cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. 
*/ int main () { ; return 0; } _ACEOF if ac_fn_cxx_try_compile "$LINENO"; then : ac_cv_prog_cxx_g=yes fi rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext fi rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext fi rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext ac_cxx_werror_flag=$ac_save_cxx_werror_flag fi { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cxx_g" >&5 $as_echo "$ac_cv_prog_cxx_g" >&6; } if test "$ac_test_CXXFLAGS" = set; then CXXFLAGS=$ac_save_CXXFLAGS elif test $ac_cv_prog_cxx_g = yes; then if test "$GXX" = yes; then CXXFLAGS="-g -O2" else CXXFLAGS="-g" fi else if test "$GXX" = yes; then CXXFLAGS="-O2" else CXXFLAGS= fi fi ac_ext=c ac_cpp='$CPP $CPPFLAGS' ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_c_compiler_gnu ac_ext=cpp ac_cpp='$CXXCPP $CPPFLAGS' ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_cxx_compiler_gnu SAVE_CXXFLAGS="$CXX_FLAGS" MY_CUDA_HOME="${CUDA_HOME}" if test "x${CROSS}" == x ; then CROSS=noflags elif test "x${CROSS}" == xTRUE || test "x${CROSS}" == xFALSE || test "x${CROSS}" == xNA; then CROSS_BOOL=yes fi { $as_echo "$as_me:${as_lineno-$LINENO}: value of 'CROSS' is '${CROSS}'." >&5 $as_echo "$as_me: value of 'CROSS' is '${CROSS}'." >&6;} if test "x${USERASKED}" == x ; then USERASKED_FLAG="" elif test "x${USERASKED}" == xTRUE ; then USERASKED_FLAG="-DUSERASKED=true" else USERASKED_FLAG="-DUSERASKED=false" fi #AC_MSG_NOTICE([value of 'MEM_IS_ALIGNED' is '${MEM_IS_ALIGNED}'.]) if test "x${MEM_IS_ALIGNED}" == x ; then MEM_IS_ALIGNED_FLAG="-DMEMisALIGNED=Nan" elif test "x${MEM_IS_ALIGNED}" == xTRUE ; then MEM_IS_ALIGNED_FLAG="-DMEMisALIGNED=True -DMEM_IS_ALIGNED" else MEM_IS_ALIGNED_FLAG="-DMEMisALIGNED=False" fi ###################################################################### ## availability of run-time checks ###################################################################### #include ; # int main(){int B=1, s[[4]];__cpuid(s, B);} ]])] CPUID_FLAG="" { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether __cpuid is available" >&5 $as_echo_n "checking whether __cpuid is available... " >&6; } cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ #include #include int main(){int B=1, s[4]; __cpuid(s, B);} _ACEOF if ac_fn_cxx_try_compile "$LINENO"; then : CPUID_FLAG="-DWINCPUID" fi rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext if test "x${CPUID_FLAG}" != x ; then { $as_echo "$as_me:${as_lineno-$LINENO}: result: via intrin.h" >&5 $as_echo "via intrin.h" >&6; } else { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 $as_echo "no" >&6; } { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether cpuid works under asm" >&5 $as_echo_n "checking whether cpuid works under asm... " >&6; } cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ #include #include int main(){unsigned B; uint32_t s[4]; asm volatile ("cpuid": "=a"(s[0]), "=b"(s[1]),"=c"(s[2]), "=d"(s[3]):"a"(B),"c"(0)); } _ACEOF if ac_fn_cxx_try_compile "$LINENO"; then : CPUID_FLAG="-DLINUXCPUID" fi rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext if test "x${CPUID_FLAG}" != x ; then { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 $as_echo "yes" >&6; } else cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. 
*/ #include #include int main(){uint32_t a,b,c,d,level=0; __cpuid(level, a, b, c, d); } _ACEOF if ac_fn_cxx_try_compile "$LINENO"; then : CPUID_FLAG="-DMINGWCPUID" fi rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext if test "x${CPUID_FLAG}" != x ; then { $as_echo "$as_me:${as_lineno-$LINENO}: result: via cpuid.h" >&5 $as_echo "via cpuid.h" >&6; } else { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 $as_echo "no" >&6; } fi fi fi ###################################################################### ## cuda ###################################################################### # Details of CUDA compilation MY_CUDA_LIBS="" #MY_CUDA="" GPU_FLAG=" -DGPU_NEEDS=Inone" if test "x${MY_CU_FILES}" != x && test "x${CPUID_FLAG}" != "x-DWINCPUID" ; then GPU_FLAG=" -DGPU_NEEDS=Igpu" if test "x${USE_GPU}" != x ; then ## both cuda and graphics card available? { $as_echo "$as_me:${as_lineno-$LINENO}: value of 'USE_GPU' is '${USE_GPU}'." >&5 $as_echo "$as_me: value of 'USE_GPU' is '${USE_GPU}'." >&6;} { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether CUDA_HOME is set" >&5 $as_echo_n "checking whether CUDA_HOME is set... " >&6; } if test -z "${MY_CUDA_HOME}"; then { $as_echo "$as_me:${as_lineno-$LINENO}: result: no -- using /usr/local/cuda" >&5 $as_echo "no -- using /usr/local/cuda" >&6; } MY_CUDA_HOME="/usr/local/cuda" else { $as_echo "$as_me:${as_lineno-$LINENO}: result: using CUDA_HOME=${MY_CUDA_HOME}" >&5 $as_echo "using CUDA_HOME=${MY_CUDA_HOME}" >&6; } fi as_ac_File=`$as_echo "ac_cv_file_${MY_CUDA_HOME}/bin/nvcc" | $as_tr_sh` { $as_echo "$as_me:${as_lineno-$LINENO}: checking for ${MY_CUDA_HOME}/bin/nvcc" >&5 $as_echo_n "checking for ${MY_CUDA_HOME}/bin/nvcc... " >&6; } if eval \${$as_ac_File+:} false; then : $as_echo_n "(cached) " >&6 else test "$cross_compiling" = yes && as_fn_error $? "cannot check for file existence when cross compiling" "$LINENO" 5 if test -r "${MY_CUDA_HOME}/bin/nvcc"; then eval "$as_ac_File=yes" else eval "$as_ac_File=no" fi fi eval ac_res=\$$as_ac_File { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 $as_echo "$ac_res" >&6; } if eval test \"x\$"$as_ac_File"\" = x"yes"; then : CUDA_INSTALLED=yes else CUDA_INSTALLED=no fi if test ${CUDA_INSTALLED} == yes ; then { $as_echo "$as_me:${as_lineno-$LINENO}: ${CUDA_INSTALLED}" >&5 $as_echo "$as_me: ${CUDA_INSTALLED}" >&6;} as_ac_File=`$as_echo "ac_cv_file_${MY_CUDA_HOME}/lib64/libcublas.so" | $as_tr_sh` { $as_echo "$as_me:${as_lineno-$LINENO}: checking for ${MY_CUDA_HOME}/lib64/libcublas.so" >&5 $as_echo_n "checking for ${MY_CUDA_HOME}/lib64/libcublas.so... " >&6; } if eval \${$as_ac_File+:} false; then : $as_echo_n "(cached) " >&6 else test "$cross_compiling" = yes && as_fn_error $? "cannot check for file existence when cross compiling" "$LINENO" 5 if test -r "${MY_CUDA_HOME}/lib64/libcublas.so"; then eval "$as_ac_File=yes" else eval "$as_ac_File=no" fi fi eval ac_res=\$$as_ac_File { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 $as_echo "$ac_res" >&6; } if eval test \"x\$"$as_ac_File"\" = x"yes"; then : else as_fn_error $? "this package only works with 64 bit installations of CUDA" "$LINENO" 5 fi GPU_FLAG="-DGPU_NEEDS=Inone -DUSEGPU" MY_CUDA_LIBS="-L${MY_CUDA_HOME}/lib64 -lcudart -lcublas -lcusolver -lcusolverMg" else if test "x${USE_GPU}" == xtry ; then MY_CU_FILES="" else as_fn_error $? "No CUDA installation found, install CUDA or specify CUDA_HOME." "$LINENO" 5 fi fi else { $as_echo "$as_me:${as_lineno-$LINENO}: 'USE_GPU' has not been set." 
>&5 $as_echo "$as_me: 'USE_GPU' has not been set." >&6;} MY_CU_FILES="" fi fi ###################################################################### ### SIMD ###################################################################### #AC_MSG_NOTICE([simdflags ${SIMD_FLAGS}]) if test "x${SIMD_FLAGS}" == x ; then { $as_echo "$as_me:${as_lineno-$LINENO}: checking SIMD options for some CC files" >&5 $as_echo_n "checking SIMD options for some CC files... " >&6; } ## if test "x${MY_MAX_AVX}" != x || test "x${MY_MAX_AVX2}" != x || test "x${MY_MAX_AVX512F}" != x ; then ANY_MAX_AVX=yes fi if test "x${MY_SSE2}" != x || test "x${MY_MAX_SSE2}" != x || test "x${MY_MAX_SSE3}" != x || test "x${MY_MAX_SSSE3}" != x || test "x${MY_MAX_SSE41}" != x || test "x${ANY_MAX_AVX}" == xyes ; then SIMD_FLAGS="sse2 $SIMD_FLAGS" fi if test "x${MY_SSE3}" != x || test "x${MY_MAX_SSE3}" != x || test "x${MY_MAX_SSSE3}" != x || test "x${MY_MAX_SSE41}" != x || test "x${ANY_MAX_AVX}" == xyes ; then SIMD_FLAGS="sse3 $SIMD_FLAGS" fi if test "x${MY_SSSE3}" != x || test "x${MY_MAX_SSSE3}" != x || test "x${MY_MAX_SSE41}" != x || test "x${ANY_MAX_AVX}" == xyes ; then SIMD_FLAGS="ssse3 $SIMD_FLAGS" fi if test "x${MY_SSE41}" != x || test "x${MY_MAX_SSE41}" != x || test "x${ANY_MAX_AVX}" == xyes ; then SIMD_FLAGS="sse41 $SIMD_FLAGS" fi if test "x${MY_AVX}" != x || test "x${ANY_MAX_AVX}" == xyes ; then SIMD_FLAGS="avx $SIMD_FLAGS" fi if test "x${MY_AVX2}" != x || test "x${MY_MAX_AVX2}" != x || test "x${MY_MAX_AVX512F}" != x; then SIMD_FLAGS="avx2 $SIMD_FLAGS" fi if test "x${MY_AVX512F}" != x || test "x${MY_MAX_AVX512F}" != x ; then SIMD_FLAGS="avx512f $SIMD_FLAGS" fi { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${SIMD_FLAGS}" >&5 $as_echo "${SIMD_FLAGS}" >&6; } fi #AC_MSG_NOTICE([simdflags ${SIMD_FLAGS}]) ## which of SIMD_FLAGS are recognized? if test "x${CROSS_BOOL}" == xyes ; then ## sysctl -a | grep "cpu.features:" # for OS X # Extract the first word of "lscpu", so it can be a program name with args. set dummy lscpu; ac_word=$2 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 $as_echo_n "checking for $ac_word... " >&6; } if ${ac_cv_prog_LSCPU+:} false; then : $as_echo_n "(cached) " >&6 else if test -n "$LSCPU"; then ac_cv_prog_LSCPU="$LSCPU" # Let the user override the test. else as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. for ac_exec_ext in '' $ac_executable_extensions; do if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then ac_cv_prog_LSCPU="yes" $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi done done IFS=$as_save_IFS test -z "$ac_cv_prog_LSCPU" && ac_cv_prog_LSCPU="no" fi fi LSCPU=$ac_cv_prog_LSCPU if test -n "$LSCPU"; then { $as_echo "$as_me:${as_lineno-$LINENO}: result: $LSCPU" >&5 $as_echo "$LSCPU" >&6; } else { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 $as_echo "no" >&6; } fi if test "x${LSCPU}" == xyes ; then INFO_CPU=`lscpu | grep Flags | tr "[:upper:]" "[:lower:]"` else # Extract the first word of "sysctl", so it can be a program name with args. set dummy sysctl; ac_word=$2 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 $as_echo_n "checking for $ac_word... " >&6; } if ${ac_cv_prog_SYSCTL+:} false; then : $as_echo_n "(cached) " >&6 else if test -n "$SYSCTL"; then ac_cv_prog_SYSCTL="$SYSCTL" # Let the user override the test. else as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. 
for ac_exec_ext in '' $ac_executable_extensions; do if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then ac_cv_prog_SYSCTL="yes" $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi done done IFS=$as_save_IFS test -z "$ac_cv_prog_SYSCTL" && ac_cv_prog_SYSCTL="no" fi fi SYSCTL=$ac_cv_prog_SYSCTL if test -n "$SYSCTL"; then { $as_echo "$as_me:${as_lineno-$LINENO}: result: $SYSCTL" >&5 $as_echo "$SYSCTL" >&6; } else { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 $as_echo "no" >&6; } fi if test "x${SYSCTL}" == xyes ; then INFO_CPU=`sysctl -a 2> /dev/null | grep machdep.cpu.features | tr "[:upper:]" "[:lower:]"` fi fi { $as_echo "$as_me:${as_lineno-$LINENO}: checking which SIMD flags can be recognized easily" >&5 $as_echo_n "checking which SIMD flags can be recognized easily... " >&6; } TMP=${SIMD_FLAGS} SIMD_FLAGS="" for SET in ${TMP} ; do CXXFLAGS="$SAVE_CXXFLAGS -m${SET}" ## name is obligatory info=`echo "${INFO_CPU}" | grep " $SET "` if test "x${info}" != x ; then cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ _ACEOF if ac_fn_cxx_try_compile "$LINENO"; then : SIMD_FLAGS="${SIMD_FLAGS} $SET" fi rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext fi done if test "x${SIMD_FLAGS}" == x ; then { $as_echo "$as_me:${as_lineno-$LINENO}: result: none" >&5 $as_echo "none" >&6; } else { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${SIMD_FLAGS}" >&5 $as_echo "${SIMD_FLAGS}" >&6; } USE_AVX="yes" fi elif test "x${CROSS}" != xnosimd && test "x${CROSS}" != xnoflags ; then USE_AVX="yes" fi #AC_MSG_NOTICE([simdflags ${SIMD_FLAGS}]) # which counterpart "-mno-xxx" exists? NOT_EQUAL_OR_HIGHER_FLAG="" { $as_echo "$as_me:${as_lineno-$LINENO}: checking which downwards controls might be used" >&5 $as_echo_n "checking which downwards controls might be used... " >&6; } if test "x${CROSS}" == xnoflags ; then CXXFLAGS="" elif test "x${CROSS}" == xarm32 ; then CXXFLAGS="-mfpu=neon -funsafe-math-optimizations" cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ _ACEOF if ac_fn_cxx_try_compile "$LINENO"; then : CROSS_FLAG="${CXXFLAGS}" else CROSS_FLAG="" fi rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext else CXXFLAGS="-mno-sse2" cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ _ACEOF if ac_fn_cxx_try_compile "$LINENO"; then : NOT_SSE2="${CXXFLAGS}" else CXXFLAGS="" fi rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext fi if test "x${USE_AVX}" != xyes ; then { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${NOT_SSE2}." >&5 $as_echo "${NOT_SSE2}." >&6; } else if test "x${CROSS}" != xmmx ; then DO_SSE2="-msse2" CXXFLAGS="-mno-sse3" cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ _ACEOF if ac_fn_cxx_try_compile "$LINENO"; then : NOT_SSE3="${CXXFLAGS}" else CXXFLAGS="" fi rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext if test "x${CROSS}" != xsse2 ; then DO_SSE3="-msse3" CXXFLAGS="-mno-ssse3" cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ _ACEOF if ac_fn_cxx_try_compile "$LINENO"; then : NOT_SSSE3="${CXXFLAGS}" else CXXFLAGS="" fi rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext if test "x${CROSS}" != xsse3 ; then DO_SSSE3="-mssse3" CXXFLAGS="-mno-sse4.1" cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. 
*/ _ACEOF if ac_fn_cxx_try_compile "$LINENO"; then : NOT_SSE41="${CXXFLAGS}" else CXXFLAGS="" fi rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext if test "x${CROSS}" != xssse3 ; then DO_SSE41="-msse4.1" CXXFLAGS="-mno-avx" cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ _ACEOF if ac_fn_cxx_try_compile "$LINENO"; then : NOT_AVX="${CXXFLAGS}" else CXXFLAGS="" fi rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext if test "x${CROSS}" != xsse41 ; then DO_AVX="-mavx" CXXFLAGS="-mno-avx2" cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ _ACEOF if ac_fn_cxx_try_compile "$LINENO"; then : NOT_AVX2="${CXXFLAGS}" else CXXFLAGS="" fi rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext if test "x${CROSS}" != xavx ; then DO_AVX2="-mavx2" CXXFLAGS="-mno-avx512f" ## name is obligatory cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ _ACEOF if ac_fn_cxx_try_compile "$LINENO"; then : NOT_AVX512F="${CXXFLAGS}" else CXXFLAGS="" fi rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext if test "x${CROSS}" != xavx2 ; then DO_AVX512F="-mavx512f" if test "x${CROSS_BOOL}" == xyes || test "x${CROSS}" == xavx512f ; then CXXFLAGS="" else as_fn_error $? "unrecognized CROSS option '${CROSS}'" "$LINENO" 5 fi fi fi fi fi fi fi fi if test "x${CROSS_BOOL}" != xyes ; then ## and USE_AVX=yes NOT_EQUAL_OR_HIGHER_FLAG="${CXXFLAGS}" fi { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${NOT_AVX512F} ${NOT_AVX2} ${NOT_AVX} ${NOT_SSE41} ${NOT_SSSE3} ${NOT_SSE3} ${NOT_SSE2}." >&5 $as_echo "${NOT_AVX512F} ${NOT_AVX2} ${NOT_AVX} ${NOT_SSE41} ${NOT_SSSE3} ${NOT_SSE3} ${NOT_SSE2}." >&6; } for SET in ${SIMD_FLAGS} ; do ## set MY_xxx_FLAGS sharp to xxx (and nothing higher, ## if supported by compiler) CXXFLAGS="$SAVE_CXXFLAGS -m$SET" ## name is obligatory flag_test=0 ## test only necesary if CROSS not in {TRUE, FALSE, nosimd}, ## but performed also cases except CROSS=nosimd cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ _ACEOF if ac_fn_cxx_try_compile "$LINENO"; then : flag_test=1 else flag_test=0 fi rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext if test $flag_test == 0 ; then as_fn_error $? 
"compilation failure for '$SAVE_CXXFLAGS -m$SET'" "$LINENO" 5 else if test "${SET}" == sse2 ; then SSE2_FLAGS="${DO_SSE2} ${NOT_SSE3}" elif test "${SET}" == sse3 ; then SSE3_FLAGS="${DO_SSE3} ${NOT_SSSE3}" elif test "${SET}" == ssse3 ; then SSSE3_FLAGS="${DO_SSSE3} ${NOT_SSE41}" elif test "${SET}" == sse41 ; then SSE41_FLAGS="${DO_SSE41} ${NOT_AVX}" elif test "${SET}" == avx ; then AVX_FLAGS="${DO_AVX} ${NOT_AVX2}" elif test "${SET}" == avx2 ; then AVX2_FLAGS="${DO_AVX2} ${NOT_AVX512F}" elif test "${SET}" == avx512f ; then AVX512F_FLAGS="${DO_AVX512F} " else EXOTIC_SIMD_FLAGS="-m${SET} ${EXOTIC_SIMD_FLAGS}" # iteratively called fi fi done fi ## add the list of MY_MAX_xxx files to list of MY_xxx files if test "x${MY_MAX_AVX512F}" != x ; then MAX_FILES="${MAX_FILES} ${MY_MAX_AVX512F}" fi if test "x${DO_AVX512F}" != x && test "x${MAX_FILES}" != x ; then MY_AVX512F="${MAX_FILES} ${MY_AVX512F}" MAX_FILES="" fi if test "x${MY_MAX_AVX2}" != x ; then MAX_FILES="${MY_MAX_AVX2} ${MAX_FILES}" fi if test "x${DO_AVX2}" != x && test "x${MAX_FILES}" != x ; then MY_AVX2="${MAX_FILES} ${MY_AVX2}" MAX_FILES="" fi if test "x${MY_MAX_AVX}" != x ; then MAX_FILES="${MAX_FILES} ${MY_MAX_AVX}" fi if test "x${DO_AVX}" != x && test "x${MAX_FILES}" != x ; then MY_AVX="${MAX_FILES} ${MY_AVX}" MAX_FILES="" fi if test "x${MY_MAX_SSE41}" != x ; then MAX_FILES="${MAX_FILES} ${MY_MAX_SSE41}" fi if test "x${DO_SSE41}" != x && test "x${MAX_FILES}" != x ; then MY_SSE41="${MAX_FILES} ${MY_SSE41}" MAX_FILES="" fi if test "x${MY_MAX_SSSE3}" != x ; then MAX_FILES="${MAX_FILES} ${MY_MAX_SSSE3}" fi if test "x${DO_SSSE3}" != x && test "x${MAX_FILES}" != x ; then MY_SSSE3="${MAX_FILES} ${MY_SSSE3}" MAX_FILES="" fi if test "x${MY_MAX_SSE3}" != x ; then MAX_FILES="${MAX_FILES} ${MY_MAX_SSE3}" fi if test "x${DO_SSE3}" != x && test "x${MAX_FILES}" != x ; then MY_SSE3="${MAX_FILES} ${MY_SSE3}" MAX_FILES="" fi if test "x${MY_MAX_SSE2}" != x ; then MAX_FILES="${MAX_FILES} ${MY_MAX_SSE2}" fi if test "x${DO_SSE2}" != x && test "x${MAX_FILES}" != x ; then MY_SSE2="${MAX_FILES} ${MY_SSE2}" MAX_FILES="" fi #AC_MSG_NOTICE([XXXuseavx=${USEAVX} less than ${NOT_EQUAL_OR_HIGHER_FLAG}; server=${CROSS}]) ## determine the SIMD upper bound in case of CROSS=TRUE if test "x${USE_AVX}" != xyes ; then if test "x$CROSS" != xFALSE && test "x$CROSS" != xnoflags ; then NOT_EQUAL_OR_HIGHER_FLAG="${NOT_SSE2}" fi elif test "x$CROSS" == xTRUE ; then if test "x${MY_AVX512F}" != x ; then NOT_EQUAL_OR_HIGHER_FLAG="" elif test "x${MY_AVX2}" != x ; then NOT_EQUAL_OR_HIGHER_FLAG="${NOT_AVX512F}" elif test "x${MY_AVX}" != x ; then NOT_EQUAL_OR_HIGHER_FLAG="${NOT_AVX2}" elif test "x${MY_SSE41}" != x ; then NOT_EQUAL_OR_HIGHER_FLAG="${NOT_AVX}" elif test "x${MY_SSSE3}" != x ; then NOT_EQUAL_OR_HIGHER_FLAG="${NOT_SSE41}" elif test "x${MY_SSE3}" != x ; then NOT_EQUAL_OR_HIGHER_FLAG="${NOT_SSSE3}" elif test "x${MY_SSE2}" != x ; then NOT_EQUAL_OR_HIGHER_FLAG="${NOT_SSE3}" else NOT_EQUAL_OR_HIGHER_FLAG="${NOT_SSE2}" fi fi #AC_MSG_NOTICE([useavx=${USEAVX} less than ${NOT_EQUAL_OR_HIGHER_FLAG}; server=${CROSS}]) ## prepare CROSS flags for Makevars.in if test "x$CROSS" == xTRUE ; then CROSS_FLAG="-DREQUIRED_SIMD -DCROSS_CAPACITY=${NOT_EQUAL_OR_HIGHER_FLAG}" elif test "x$CROSS" == xFALSE ; then CROSS_FLAG="-DREQUIRED_SIMD=3" elif test "x$CROSS" == xNA ; then CROSS_FLAG="-DREQUIRED_SIMD=2" elif test "x$CROSS" == xnoflags ; then CROSS_FLAG="" elif test "x$CROSS" == xarm32 ; then CROSS_FLAG="${CROSS_FLAG} -DREQUIRED_SIMD=4" ## higher 3 reservered for ARM elif test "x$CROSS" != xnosimd 
; then CROSS_FLAG="-DCROSS_CAPACITY=${CROSS}" elif test "x${NOT_SSE2}" == x ; then # && CROSS = nosimd CROSS_FLAG="-DREQUIRED_SIMD=0" else ## CROSS = nosimd, no explicit limitation possible CROSS_FLAG="-DREQUIRED_SIMD=1" fi OMP="\$(SHLIB_OPENMP_CXXFLAGS)" MY_PKG_FLAGS="${SAVE_CXXFLAGS} ${CPUID_FLAG} ${GPU_FLAG} ${EXOTIC_SIMD_FLAGS} ${NOT_EQUAL_OR_HIGHER_FLAG} ${CROSS_FLAG} ${MEM_IS_ALIGNED_FLAG} ${USERASKED_FLAG}" MY_C_FILES="$MY_C_FILES $MY_SSE2 $MY_SSE3 $MY_SSSE3 $MY_SSE41 $MY_AVX $MY_AVX2 $MY_AVX512F $MAX_FILES" MY_LIB_FLAGS="$LIB_FLAGS ${OMP}" TMP="PKG_CXXFLAGS = ${MY_PKG_FLAGS} ${OMP} " # { $as_echo "$as_me:${as_lineno-$LINENO}: default compilation option is ${MY_PKG_FLAGS}" >&5 $as_echo "$as_me: default compilation option is ${MY_PKG_FLAGS}" >&6;} ## prepare MY_xxx for Makevars.in if test "x${USE_AVX}" == xyes ; then if test "x${MY_SSE2}" != x ; then MY_SSE2="${MY_SSE2}: ${TMP} ${SSE2_FLAGS}" fi if test "x${MY_SSE3}" != x ; then MY_SSE3="${MY_SSE3}: ${TMP} ${SSE3_FLAGS}" fi if test "x${MY_SSSE3}" != x ; then MY_SSSE3="${MY_SSSE3}: ${TMP} ${SSSE3_FLAGS}" fi if test "x${MY_SSE41}" != x ; then MY_SSE41="${MY_SSE41}: ${TMP} ${SSE41_FLAGS}" fi if test "x${MY_AVX}" != x ; then MY_AVX="${MY_AVX}: ${TMP} ${AVX_FLAGS}" fi if test "x${MY_AVX2}" != x ; then MY_AVX2="${MY_AVX2}: ${TMP} ${AVX2_FLAGS}" fi if test "x${MY_AVX512F}" != x ; then MY_AVX512F="${MY_AVX512F}: ${TMP} ${AVX512F_FLAGS}" fi else MY_SSE2="" MY_SSE3="" MY_SSSE3="" MY_SSE41="" MY_AVX="" MY_AVX2="" MY_AVX512F="" fi #AC_SUBST(MY_CUDA) ac_config_files="$ac_config_files src/Makevars" cat >confcache <<\_ACEOF # This file is a shell script that caches the results of configure # tests run on this system so they can be shared between configure # scripts and configure runs, see configure's option --config-cache. # It is not useful on other systems. If it contains results you don't # want to keep, you may remove or edit it. # # config.status only pays attention to the cache file if you give it # the --recheck option to rerun configure. # # `ac_cv_env_foo' variables (set or unset) will be overridden when # loading this file, other *unset* `ac_cv_foo' will be assigned the # following values. _ACEOF # The following way of writing the cache mishandles newlines in values, # but we know of no workaround that is simple, portable, and efficient. # So, we kill variables containing newlines. # Ultrix sh set writes to stderr and can't be redirected directly, # and sets the high bit in the cache file unless we assign to the vars. ( for ac_var in `(set) 2>&1 | sed -n 's/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'`; do eval ac_val=\$$ac_var case $ac_val in #( *${as_nl}*) case $ac_var in #( *_cv_*) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5 $as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; esac case $ac_var in #( _ | IFS | as_nl) ;; #( BASH_ARGV | BASH_SOURCE) eval $ac_var= ;; #( *) { eval $ac_var=; unset $ac_var;} ;; esac ;; esac done (set) 2>&1 | case $as_nl`(ac_space=' '; set) 2>&1` in #( *${as_nl}ac_space=\ *) # `set' does not quote correctly, so add quotes: double-quote # substitution turns \\\\ into \\, and sed turns \\ into \. sed -n \ "s/'/'\\\\''/g; s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\\2'/p" ;; #( *) # `set' quotes correctly as required by POSIX, so do not add quotes. 
sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p" ;; esac | sort ) | sed ' /^ac_cv_env_/b end t clear :clear s/^\([^=]*\)=\(.*[{}].*\)$/test "${\1+set}" = set || &/ t end s/^\([^=]*\)=\(.*\)$/\1=${\1=\2}/ :end' >>confcache if diff "$cache_file" confcache >/dev/null 2>&1; then :; else if test -w "$cache_file"; then if test "x$cache_file" != "x/dev/null"; then { $as_echo "$as_me:${as_lineno-$LINENO}: updating cache $cache_file" >&5 $as_echo "$as_me: updating cache $cache_file" >&6;} if test ! -f "$cache_file" || test -h "$cache_file"; then cat confcache >"$cache_file" else case $cache_file in #( */* | ?:*) mv -f confcache "$cache_file"$$ && mv -f "$cache_file"$$ "$cache_file" ;; #( *) mv -f confcache "$cache_file" ;; esac fi fi else { $as_echo "$as_me:${as_lineno-$LINENO}: not updating unwritable cache $cache_file" >&5 $as_echo "$as_me: not updating unwritable cache $cache_file" >&6;} fi fi rm -f confcache test "x$prefix" = xNONE && prefix=$ac_default_prefix # Let make expand exec_prefix. test "x$exec_prefix" = xNONE && exec_prefix='${prefix}' # Transform confdefs.h into DEFS. # Protect against shell expansion while executing Makefile rules. # Protect against Makefile macro expansion. # # If the first sed substitution is executed (which looks for macros that # take arguments), then branch to the quote section. Otherwise, # look for a macro that doesn't take arguments. ac_script=' :mline /\\$/{ N s,\\\n,, b mline } t clear :clear s/^[ ]*#[ ]*define[ ][ ]*\([^ (][^ (]*([^)]*)\)[ ]*\(.*\)/-D\1=\2/g t quote s/^[ ]*#[ ]*define[ ][ ]*\([^ ][^ ]*\)[ ]*\(.*\)/-D\1=\2/g t quote b any :quote s/[ `~#$^&*(){}\\|;'\''"<>?]/\\&/g s/\[/\\&/g s/\]/\\&/g s/\$/$$/g H :any ${ g s/^\n// s/\n/ /g p } ' DEFS=`sed -n "$ac_script" confdefs.h` ac_libobjs= ac_ltlibobjs= U= for ac_i in : $LIBOBJS; do test "x$ac_i" = x: && continue # 1. Remove the extension, and $U if already installed. ac_script='s/\$U\././;s/\.o$//;s/\.obj$//' ac_i=`$as_echo "$ac_i" | sed "$ac_script"` # 2. Prepend LIBOBJDIR. When used with automake>=1.10 LIBOBJDIR # will be set to the directory where LIBOBJS objects are built. as_fn_append ac_libobjs " \${LIBOBJDIR}$ac_i\$U.$ac_objext" as_fn_append ac_ltlibobjs " \${LIBOBJDIR}$ac_i"'$U.lo' done LIBOBJS=$ac_libobjs LTLIBOBJS=$ac_ltlibobjs : "${CONFIG_STATUS=./config.status}" ac_write_fail=0 ac_clean_files_save=$ac_clean_files ac_clean_files="$ac_clean_files $CONFIG_STATUS" { $as_echo "$as_me:${as_lineno-$LINENO}: creating $CONFIG_STATUS" >&5 $as_echo "$as_me: creating $CONFIG_STATUS" >&6;} as_write_fail=0 cat >$CONFIG_STATUS <<_ASEOF || as_write_fail=1 #! $SHELL # Generated by $as_me. # Run this file to recreate the current configuration. # Compiler output produced by configure, useful for debugging # configure, is in config.log if it exists. debug=false ac_cs_recheck=false ac_cs_silent=false SHELL=\${CONFIG_SHELL-$SHELL} export SHELL _ASEOF cat >>$CONFIG_STATUS <<\_ASEOF || as_write_fail=1 ## -------------------- ## ## M4sh Initialization. ## ## -------------------- ## # Be more Bourne compatible DUALCASE=1; export DUALCASE # for MKS sh if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then : emulate sh NULLCMD=: # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which # is contrary to our usage. Disable this feature. alias -g '${1+"$@"}'='"$@"' setopt NO_GLOB_SUBST else case `(set -o) 2>/dev/null` in #( *posix*) : set -o posix ;; #( *) : ;; esac fi as_nl=' ' export as_nl # Printing a long string crashes Solaris 7 /usr/bin/printf. 
as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo # Prefer a ksh shell builtin over an external printf program on Solaris, # but without wasting forks for bash or zsh. if test -z "$BASH_VERSION$ZSH_VERSION" \ && (test "X`print -r -- $as_echo`" = "X$as_echo") 2>/dev/null; then as_echo='print -r --' as_echo_n='print -rn --' elif (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then as_echo='printf %s\n' as_echo_n='printf %s' else if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"' as_echo_n='/usr/ucb/echo -n' else as_echo_body='eval expr "X$1" : "X\\(.*\\)"' as_echo_n_body='eval arg=$1; case $arg in #( *"$as_nl"*) expr "X$arg" : "X\\(.*\\)$as_nl"; arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;; esac; expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl" ' export as_echo_n_body as_echo_n='sh -c $as_echo_n_body as_echo' fi export as_echo_body as_echo='sh -c $as_echo_body as_echo' fi # The user is always right. if test "${PATH_SEPARATOR+set}" != set; then PATH_SEPARATOR=: (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && { (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 || PATH_SEPARATOR=';' } fi # IFS # We need space, tab and new line, in precisely that order. Quoting is # there to prevent editors from complaining about space-tab. # (If _AS_PATH_WALK were called with IFS unset, it would disable word # splitting by setting IFS to empty value.) IFS=" "" $as_nl" # Find who we are. Look in the path if we contain no directory separator. as_myself= case $0 in #(( *[\\/]* ) as_myself=$0 ;; *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break done IFS=$as_save_IFS ;; esac # We did not find ourselves, most probably we were run as `sh COMMAND' # in which case we are not to be found in the path. if test "x$as_myself" = x; then as_myself=$0 fi if test ! -f "$as_myself"; then $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2 exit 1 fi # Unset variables that we do not need and which cause bugs (e.g. in # pre-3.0 UWIN ksh). But do not cause bugs in bash 2.01; the "|| exit 1" # suppresses any "Segmentation fault" message there. '((' could # trigger a bug in pdksh 5.2.14. for as_var in BASH_ENV ENV MAIL MAILPATH do eval test x\${$as_var+set} = xset \ && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || : done PS1='$ ' PS2='> ' PS4='+ ' # NLS nuisances. LC_ALL=C export LC_ALL LANGUAGE=C export LANGUAGE # CDPATH. (unset CDPATH) >/dev/null 2>&1 && unset CDPATH # as_fn_error STATUS ERROR [LINENO LOG_FD] # ---------------------------------------- # Output "`basename $0`: error: ERROR" to stderr. If LINENO and LOG_FD are # provided, also output the error to LOG_FD, referencing LINENO. Then exit the # script with STATUS, using 1 if that was 0. as_fn_error () { as_status=$1; test $as_status -eq 0 && as_status=1 if test "$4"; then as_lineno=${as_lineno-"$3"} as_lineno_stack=as_lineno_stack=$as_lineno_stack $as_echo "$as_me:${as_lineno-$LINENO}: error: $2" >&$4 fi $as_echo "$as_me: error: $2" >&2 as_fn_exit $as_status } # as_fn_error # as_fn_set_status STATUS # ----------------------- # Set $? to STATUS, without forking. 
as_fn_set_status () { return $1 } # as_fn_set_status # as_fn_exit STATUS # ----------------- # Exit the shell with STATUS, even in a "trap 0" or "set -e" context. as_fn_exit () { set +e as_fn_set_status $1 exit $1 } # as_fn_exit # as_fn_unset VAR # --------------- # Portably unset VAR. as_fn_unset () { { eval $1=; unset $1;} } as_unset=as_fn_unset # as_fn_append VAR VALUE # ---------------------- # Append the text in VALUE to the end of the definition contained in VAR. Take # advantage of any shell optimizations that allow amortized linear growth over # repeated appends, instead of the typical quadratic growth present in naive # implementations. if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null; then : eval 'as_fn_append () { eval $1+=\$2 }' else as_fn_append () { eval $1=\$$1\$2 } fi # as_fn_append # as_fn_arith ARG... # ------------------ # Perform arithmetic evaluation on the ARGs, and store the result in the # global $as_val. Take advantage of shells that can avoid forks. The arguments # must be portable across $(()) and expr. if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null; then : eval 'as_fn_arith () { as_val=$(( $* )) }' else as_fn_arith () { as_val=`expr "$@" || test $? -eq 1` } fi # as_fn_arith if expr a : '\(a\)' >/dev/null 2>&1 && test "X`expr 00001 : '.*\(...\)'`" = X001; then as_expr=expr else as_expr=false fi if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then as_basename=basename else as_basename=false fi if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then as_dirname=dirname else as_dirname=false fi as_me=`$as_basename -- "$0" || $as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \ X"$0" : 'X\(//\)$' \| \ X"$0" : 'X\(/\)' \| . 2>/dev/null || $as_echo X/"$0" | sed '/^.*\/\([^/][^/]*\)\/*$/{ s//\1/ q } /^X\/\(\/\/\)$/{ s//\1/ q } /^X\/\(\/\).*/{ s//\1/ q } s/.*/./; q'` # Avoid depending upon Character Ranges. as_cr_letters='abcdefghijklmnopqrstuvwxyz' as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ' as_cr_Letters=$as_cr_letters$as_cr_LETTERS as_cr_digits='0123456789' as_cr_alnum=$as_cr_Letters$as_cr_digits ECHO_C= ECHO_N= ECHO_T= case `echo -n x` in #((((( -n*) case `echo 'xy\c'` in *c*) ECHO_T=' ';; # ECHO_T is single tab character. xy) ECHO_C='\c';; *) echo `echo ksh88 bug on AIX 6.1` > /dev/null ECHO_T=' ';; esac;; *) ECHO_N='-n';; esac rm -f conf$$ conf$$.exe conf$$.file if test -d conf$$.dir; then rm -f conf$$.dir/conf$$.file else rm -f conf$$.dir mkdir conf$$.dir 2>/dev/null fi if (echo >conf$$.file) 2>/dev/null; then if ln -s conf$$.file conf$$ 2>/dev/null; then as_ln_s='ln -s' # ... but there are two gotchas: # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail. # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable. # In both cases, we have to default to `cp -pR'. ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe || as_ln_s='cp -pR' elif ln conf$$.file conf$$ 2>/dev/null; then as_ln_s=ln else as_ln_s='cp -pR' fi else as_ln_s='cp -pR' fi rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file rmdir conf$$.dir 2>/dev/null # as_fn_mkdir_p # ------------- # Create "$as_dir" as a directory, including parents if necessary. 
as_fn_mkdir_p () { case $as_dir in #( -*) as_dir=./$as_dir;; esac test -d "$as_dir" || eval $as_mkdir_p || { as_dirs= while :; do case $as_dir in #( *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'( *) as_qdir=$as_dir;; esac as_dirs="'$as_qdir' $as_dirs" as_dir=`$as_dirname -- "$as_dir" || $as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ X"$as_dir" : 'X\(//\)[^/]' \| \ X"$as_dir" : 'X\(//\)$' \| \ X"$as_dir" : 'X\(/\)' \| . 2>/dev/null || $as_echo X"$as_dir" | sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ s//\1/ q } /^X\(\/\/\)[^/].*/{ s//\1/ q } /^X\(\/\/\)$/{ s//\1/ q } /^X\(\/\).*/{ s//\1/ q } s/.*/./; q'` test -d "$as_dir" && break done test -z "$as_dirs" || eval "mkdir $as_dirs" } || test -d "$as_dir" || as_fn_error $? "cannot create directory $as_dir" } # as_fn_mkdir_p if mkdir -p . 2>/dev/null; then as_mkdir_p='mkdir -p "$as_dir"' else test -d ./-p && rmdir ./-p as_mkdir_p=false fi # as_fn_executable_p FILE # ----------------------- # Test if FILE is an executable regular file. as_fn_executable_p () { test -f "$1" && test -x "$1" } # as_fn_executable_p as_test_x='test -x' as_executable_p=as_fn_executable_p # Sed expression to map a string onto a valid CPP name. as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'" # Sed expression to map a string onto a valid variable name. as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'" exec 6>&1 ## ----------------------------------- ## ## Main body of $CONFIG_STATUS script. ## ## ----------------------------------- ## _ASEOF test $as_write_fail = 0 && chmod +x $CONFIG_STATUS || ac_write_fail=1 cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # Save the log message, to keep $0 and so on meaningful, and to # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" This file was extended by RandomFieldsUtils $as_me 1.0, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES CONFIG_HEADERS = $CONFIG_HEADERS CONFIG_LINKS = $CONFIG_LINKS CONFIG_COMMANDS = $CONFIG_COMMANDS $ $0 $@ on `(hostname || uname -n) 2>/dev/null | sed 1q` " _ACEOF case $ac_config_files in *" "*) set x $ac_config_files; shift; ac_config_files=$*;; esac cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 # Files that config.status was made for. config_files="$ac_config_files" _ACEOF cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 ac_cs_usage="\ \`$as_me' instantiates files and other configuration actions from templates according to the current configuration. Unless the files and actions are specified as TAGs, all are instantiated by default. Usage: $0 [OPTION]... [TAG]... -h, --help print this help, then exit -V, --version print version number and configuration settings, then exit --config print configuration, then exit -q, --quiet, --silent do not print progress messages -d, --debug don't remove temporary files --recheck update $as_me by reconfiguring in the same conditions --file=FILE[:TEMPLATE] instantiate the configuration file FILE Configuration files: $config_files Report bugs to the package provider." _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ RandomFieldsUtils config.status 1.0 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" Copyright (C) 2012 Free Software Foundation, Inc. 
This config.status script is free software; the Free Software Foundation gives unlimited permission to copy, distribute and modify it." ac_pwd='$ac_pwd' srcdir='$srcdir' test -n "\$AWK" || AWK=awk _ACEOF cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # The default lists apply if the user does not specify any file. ac_need_defaults=: while test $# != 0 do case $1 in --*=?*) ac_option=`expr "X$1" : 'X\([^=]*\)='` ac_optarg=`expr "X$1" : 'X[^=]*=\(.*\)'` ac_shift=: ;; --*=) ac_option=`expr "X$1" : 'X\([^=]*\)='` ac_optarg= ac_shift=: ;; *) ac_option=$1 ac_optarg=$2 ac_shift=shift ;; esac case $ac_option in # Handling of the options. -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r) ac_cs_recheck=: ;; --version | --versio | --versi | --vers | --ver | --ve | --v | -V ) $as_echo "$ac_cs_version"; exit ;; --config | --confi | --conf | --con | --co | --c ) $as_echo "$ac_cs_config"; exit ;; --debug | --debu | --deb | --de | --d | -d ) debug=: ;; --file | --fil | --fi | --f ) $ac_shift case $ac_optarg in *\'*) ac_optarg=`$as_echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;; '') as_fn_error $? "missing file argument" ;; esac as_fn_append CONFIG_FILES " '$ac_optarg'" ac_need_defaults=false;; --he | --h | --help | --hel | -h ) $as_echo "$ac_cs_usage"; exit ;; -q | -quiet | --quiet | --quie | --qui | --qu | --q \ | -silent | --silent | --silen | --sile | --sil | --si | --s) ac_cs_silent=: ;; # This is an error. -*) as_fn_error $? "unrecognized option: \`$1' Try \`$0 --help' for more information." ;; *) as_fn_append ac_config_targets " $1" ac_need_defaults=false ;; esac shift done ac_configure_extra_args= if $ac_cs_silent; then exec 6>/dev/null ac_configure_extra_args="$ac_configure_extra_args --silent" fi _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 if \$ac_cs_recheck; then set X $SHELL '$0' $ac_configure_args \$ac_configure_extra_args --no-create --no-recursion shift \$as_echo "running CONFIG_SHELL=$SHELL \$*" >&6 CONFIG_SHELL='$SHELL' export CONFIG_SHELL exec "\$@" fi _ACEOF cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 exec 5>>config.log { echo sed 'h;s/./-/g;s/^.../## /;s/...$/ ##/;p;x;p;x' <<_ASBOX ## Running $as_me. ## _ASBOX $as_echo "$ac_log" } >&5 _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 _ACEOF cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # Handling of arguments. for ac_config_target in $ac_config_targets do case $ac_config_target in "src/Makevars") CONFIG_FILES="$CONFIG_FILES src/Makevars" ;; *) as_fn_error $? "invalid argument: \`$ac_config_target'" "$LINENO" 5;; esac done # If the user did not use the arguments to specify the items to instantiate, # then the envvar interface is used. Set only those that are not. # We use the long form for the default assignment because of an extremely # bizarre bug on SunOS 4.1.3. if $ac_need_defaults; then test "${CONFIG_FILES+set}" = set || CONFIG_FILES=$config_files fi # Have a temporary directory for convenience. Make it in the build tree # simply because there is no reason against having it here, and in addition, # creating and moving files from /tmp can sometimes cause problems. # Hook for its removal unless debugging. # Note that there is a small window in which the directory will not be cleaned: # after its creation but before its name has been assigned to `$tmp'. $debug || { tmp= ac_tmp= trap 'exit_status=$? : "${ac_tmp:=$tmp}" { test ! -d "$ac_tmp" || rm -fr "$ac_tmp"; } && exit $exit_status ' 0 trap 'as_fn_exit 1' 1 2 13 15 } # Create a (secure) tmp directory for tmp files. 
{ tmp=`(umask 077 && mktemp -d "./confXXXXXX") 2>/dev/null` && test -d "$tmp" } || { tmp=./conf$$-$RANDOM (umask 077 && mkdir "$tmp") } || as_fn_error $? "cannot create a temporary directory in ." "$LINENO" 5 ac_tmp=$tmp # Set up the scripts for CONFIG_FILES section. # No need to generate them if there are no CONFIG_FILES. # This happens for instance with `./config.status config.h'. if test -n "$CONFIG_FILES"; then ac_cr=`echo X | tr X '\015'` # On cygwin, bash can eat \r inside `` if the user requested igncr. # But we know of no other shell where ac_cr would be empty at this # point, so we can use a bashism as a fallback. if test "x$ac_cr" = x; then eval ac_cr=\$\'\\r\' fi ac_cs_awk_cr=`$AWK 'BEGIN { print "a\rb" }' /dev/null` if test "$ac_cs_awk_cr" = "a${ac_cr}b"; then ac_cs_awk_cr='\\r' else ac_cs_awk_cr=$ac_cr fi echo 'BEGIN {' >"$ac_tmp/subs1.awk" && _ACEOF { echo "cat >conf$$subs.awk <<_ACEOF" && echo "$ac_subst_vars" | sed 's/.*/&!$&$ac_delim/' && echo "_ACEOF" } >conf$$subs.sh || as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5 ac_delim_num=`echo "$ac_subst_vars" | grep -c '^'` ac_delim='%!_!# ' for ac_last_try in false false false false false :; do . ./conf$$subs.sh || as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5 ac_delim_n=`sed -n "s/.*$ac_delim\$/X/p" conf$$subs.awk | grep -c X` if test $ac_delim_n = $ac_delim_num; then break elif $ac_last_try; then as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5 else ac_delim="$ac_delim!$ac_delim _$ac_delim!! " fi done rm -f conf$$subs.sh cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 cat >>"\$ac_tmp/subs1.awk" <<\\_ACAWK && _ACEOF sed -n ' h s/^/S["/; s/!.*/"]=/ p g s/^[^!]*!// :repl t repl s/'"$ac_delim"'$// t delim :nl h s/\(.\{148\}\)..*/\1/ t more1 s/["\\]/\\&/g; s/^/"/; s/$/\\n"\\/ p n b repl :more1 s/["\\]/\\&/g; s/^/"/; s/$/"\\/ p g s/.\{148\}// t nl :delim h s/\(.\{148\}\)..*/\1/ t more2 s/["\\]/\\&/g; s/^/"/; s/$/"/ p b :more2 s/["\\]/\\&/g; s/^/"/; s/$/"\\/ p g s/.\{148\}// t delim ' >$CONFIG_STATUS || ac_write_fail=1 rm -f conf$$subs.awk cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 _ACAWK cat >>"\$ac_tmp/subs1.awk" <<_ACAWK && for (key in S) S_is_set[key] = 1 FS = "" } { line = $ 0 nfields = split(line, field, "@") substed = 0 len = length(field[1]) for (i = 2; i < nfields; i++) { key = field[i] keylen = length(key) if (S_is_set[key]) { value = S[key] line = substr(line, 1, len) "" value "" substr(line, len + keylen + 3) len += length(value) + length(field[++i]) substed = 1 } else len += 1 + keylen } print line } _ACAWK _ACEOF cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 if sed "s/$ac_cr//" < /dev/null > /dev/null 2>&1; then sed "s/$ac_cr\$//; s/$ac_cr/$ac_cs_awk_cr/g" else cat fi < "$ac_tmp/subs1.awk" > "$ac_tmp/subs.awk" \ || as_fn_error $? "could not setup config files machinery" "$LINENO" 5 _ACEOF # VPATH may cause trouble with some makes, so we remove sole $(srcdir), # ${srcdir} and @srcdir@ entries from VPATH if srcdir is ".", strip leading and # trailing colons and then remove the whole line if VPATH becomes empty # (actually we leave an empty line to preserve line numbers). 
if test "x$srcdir" = x.; then ac_vpsub='/^[ ]*VPATH[ ]*=[ ]*/{ h s/// s/^/:/ s/[ ]*$/:/ s/:\$(srcdir):/:/g s/:\${srcdir}:/:/g s/:@srcdir@:/:/g s/^:*// s/:*$// x s/\(=[ ]*\).*/\1/ G s/\n// s/^[^=]*=[ ]*$// }' fi cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 fi # test -n "$CONFIG_FILES" eval set X " :F $CONFIG_FILES " shift for ac_tag do case $ac_tag in :[FHLC]) ac_mode=$ac_tag; continue;; esac case $ac_mode$ac_tag in :[FHL]*:*);; :L* | :C*:*) as_fn_error $? "invalid tag \`$ac_tag'" "$LINENO" 5;; :[FH]-) ac_tag=-:-;; :[FH]*) ac_tag=$ac_tag:$ac_tag.in;; esac ac_save_IFS=$IFS IFS=: set x $ac_tag IFS=$ac_save_IFS shift ac_file=$1 shift case $ac_mode in :L) ac_source=$1;; :[FH]) ac_file_inputs= for ac_f do case $ac_f in -) ac_f="$ac_tmp/stdin";; *) # Look for the file first in the build tree, then in the source tree # (if the path is not absolute). The absolute path cannot be DOS-style, # because $ac_f cannot contain `:'. test -f "$ac_f" || case $ac_f in [\\/$]*) false;; *) test -f "$srcdir/$ac_f" && ac_f="$srcdir/$ac_f";; esac || as_fn_error 1 "cannot find input file: \`$ac_f'" "$LINENO" 5;; esac case $ac_f in *\'*) ac_f=`$as_echo "$ac_f" | sed "s/'/'\\\\\\\\''/g"`;; esac as_fn_append ac_file_inputs " '$ac_f'" done # Let's still pretend it is `configure' which instantiates (i.e., don't # use $as_me), people would be surprised to read: # /* config.h. Generated by config.status. */ configure_input='Generated from '` $as_echo "$*" | sed 's|^[^:]*/||;s|:[^:]*/|, |g' `' by configure.' if test x"$ac_file" != x-; then configure_input="$ac_file. $configure_input" { $as_echo "$as_me:${as_lineno-$LINENO}: creating $ac_file" >&5 $as_echo "$as_me: creating $ac_file" >&6;} fi # Neutralize special characters interpreted by sed in replacement strings. case $configure_input in #( *\&* | *\|* | *\\* ) ac_sed_conf_input=`$as_echo "$configure_input" | sed 's/[\\\\&|]/\\\\&/g'`;; #( *) ac_sed_conf_input=$configure_input;; esac case $ac_tag in *:-:* | *:-) cat >"$ac_tmp/stdin" \ || as_fn_error $? "could not create $ac_file" "$LINENO" 5 ;; esac ;; esac ac_dir=`$as_dirname -- "$ac_file" || $as_expr X"$ac_file" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ X"$ac_file" : 'X\(//\)[^/]' \| \ X"$ac_file" : 'X\(//\)$' \| \ X"$ac_file" : 'X\(/\)' \| . 2>/dev/null || $as_echo X"$ac_file" | sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ s//\1/ q } /^X\(\/\/\)[^/].*/{ s//\1/ q } /^X\(\/\/\)$/{ s//\1/ q } /^X\(\/\).*/{ s//\1/ q } s/.*/./; q'` as_dir="$ac_dir"; as_fn_mkdir_p ac_builddir=. case "$ac_dir" in .) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;; *) ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'` # A ".." for each directory in $ac_dir_suffix. ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'` case $ac_top_builddir_sub in "") ac_top_builddir_sub=. ac_top_build_prefix= ;; *) ac_top_build_prefix=$ac_top_builddir_sub/ ;; esac ;; esac ac_abs_top_builddir=$ac_pwd ac_abs_builddir=$ac_pwd$ac_dir_suffix # for backward compatibility: ac_top_builddir=$ac_top_build_prefix case $srcdir in .) # We are building in place. ac_srcdir=. ac_top_srcdir=$ac_top_builddir_sub ac_abs_top_srcdir=$ac_pwd ;; [\\/]* | ?:[\\/]* ) # Absolute name. ac_srcdir=$srcdir$ac_dir_suffix; ac_top_srcdir=$srcdir ac_abs_top_srcdir=$srcdir ;; *) # Relative name. 
ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix ac_top_srcdir=$ac_top_build_prefix$srcdir ac_abs_top_srcdir=$ac_pwd/$srcdir ;; esac ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix case $ac_mode in :F) # # CONFIG_FILE # _ACEOF cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # If the template does not know about datarootdir, expand it. # FIXME: This hack should be removed a few years after 2.60. ac_datarootdir_hack=; ac_datarootdir_seen= ac_sed_dataroot=' /datarootdir/ { p q } /@datadir@/p /@docdir@/p /@infodir@/p /@localedir@/p /@mandir@/p' case `eval "sed -n \"\$ac_sed_dataroot\" $ac_file_inputs"` in *datarootdir*) ac_datarootdir_seen=yes;; *@datadir@*|*@docdir@*|*@infodir@*|*@localedir@*|*@mandir@*) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&5 $as_echo "$as_me: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&2;} _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_datarootdir_hack=' s&@datadir@&$datadir&g s&@docdir@&$docdir&g s&@infodir@&$infodir&g s&@localedir@&$localedir&g s&@mandir@&$mandir&g s&\\\${datarootdir}&$datarootdir&g' ;; esac _ACEOF # Neutralize VPATH when `$srcdir' = `.'. # Shell code in configure.ac might set extrasub. # FIXME: do we really want to maintain this feature? cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_sed_extra="$ac_vpsub $extrasub _ACEOF cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 :t /@[a-zA-Z_][a-zA-Z_0-9]*@/!b s|@configure_input@|$ac_sed_conf_input|;t t s&@top_builddir@&$ac_top_builddir_sub&;t t s&@top_build_prefix@&$ac_top_build_prefix&;t t s&@srcdir@&$ac_srcdir&;t t s&@abs_srcdir@&$ac_abs_srcdir&;t t s&@top_srcdir@&$ac_top_srcdir&;t t s&@abs_top_srcdir@&$ac_abs_top_srcdir&;t t s&@builddir@&$ac_builddir&;t t s&@abs_builddir@&$ac_abs_builddir&;t t s&@abs_top_builddir@&$ac_abs_top_builddir&;t t $ac_datarootdir_hack " eval sed \"\$ac_sed_extra\" "$ac_file_inputs" | $AWK -f "$ac_tmp/subs.awk" \ >$ac_tmp/out || as_fn_error $? "could not create $ac_file" "$LINENO" 5 test -z "$ac_datarootdir_hack$ac_datarootdir_seen" && { ac_out=`sed -n '/\${datarootdir}/p' "$ac_tmp/out"`; test -n "$ac_out"; } && { ac_out=`sed -n '/^[ ]*datarootdir[ ]*:*=/p' \ "$ac_tmp/out"`; test -z "$ac_out"; } && { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file contains a reference to the variable \`datarootdir' which seems to be undefined. Please make sure it is defined" >&5 $as_echo "$as_me: WARNING: $ac_file contains a reference to the variable \`datarootdir' which seems to be undefined. Please make sure it is defined" >&2;} rm -f "$ac_tmp/stdin" case $ac_file in -) cat "$ac_tmp/out" && rm -f "$ac_tmp/out";; *) rm -f "$ac_file" && mv "$ac_tmp/out" "$ac_file";; esac \ || as_fn_error $? "could not create $ac_file" "$LINENO" 5 ;; esac done # for ac_tag as_fn_exit 0 _ACEOF ac_clean_files=$ac_clean_files_save test $ac_write_fail = 0 || as_fn_error $? "write failure creating $CONFIG_STATUS" "$LINENO" 5 # configure is writing to config.log, and then calls config.status. # config.status does its own redirection, appending to config.log. # Unfortunately, on DOS this fails, as config.log is still kept open # by configure, so config.status won't be able to write to it; its # output is simply discarded. So we exec the FD to /dev/null, # effectively closing config.log, so it can be properly (re)opened and # appended to by config.status. When coming back to configure, we # need to make the FD available again. 
if test "$no_create" != yes; then ac_cs_success=: ac_config_status_args= test "$silent" = yes && ac_config_status_args="$ac_config_status_args --quiet" exec 5>/dev/null $SHELL $CONFIG_STATUS $ac_config_status_args || ac_cs_success=false exec 5>>config.log # Use ||, not &&, to avoid exiting from the if with $? = 1, which # would make configure fail if this is the last instruction. $ac_cs_success || as_fn_exit 1 fi if test -n "$ac_unrecognized_opts" && test "$enable_option_checking" != no; then { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: unrecognized options: $ac_unrecognized_opts" >&5 $as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2;} fi