Kernelheaping/NAMESPACE

# Generated by roxygen2: do not edit by hand

S3method(plot,Kernelheaping)
S3method(plot,bivrounding)
S3method(summary,Kernelheaping)
export(createSim.Kernelheaping)
export(dbivr)
export(dclass)
export(dheaping)
export(dshape3dProp)
export(dshapebivr)
export(dshapebivrProp)
export(sim.Kernelheaping)
export(simSummary.Kernelheaping)
export(toOtherShape)
export(tracePlots)
import(MASS)
import(ks)
import(sparr)
importFrom(GB2,mlfit.gb2)
importFrom(fastmatch,fmatch)
importFrom(fitdistrplus,fitdist)
importFrom(grDevices,rainbow)
importFrom(graphics,box)
importFrom(graphics,contour)
importFrom(graphics,image)
importFrom(graphics,legend)
importFrom(graphics,lines)
importFrom(graphics,par)
importFrom(graphics,plot)
importFrom(magrittr,"%>%")
importFrom(mvtnorm,dmvnorm)
importFrom(parallel,clusterEvalQ)
importFrom(parallel,clusterExport)
importFrom(parallel,detectCores)
importFrom(parallel,makeCluster)
importFrom(parallel,parLapply)
importFrom(parallel,stopCluster)
importFrom(plyr,count)
importFrom(plyr,round_any)
importFrom(sp,point.in.polygon)
importFrom(stats,aggregate)
importFrom(stats,density)
importFrom(stats,dnorm)
importFrom(stats,optim)
importFrom(stats,pnorm)
importFrom(stats,qnorm)
importFrom(stats,quantile)
importFrom(stats,rgamma)
importFrom(stats,rnorm)
importFrom(stats,runif)
importFrom(stats,sd)

Kernelheaping/data/students.rda

[binary R data file containing the students dataset; not representable as text]

Kernelheaping/man/sim.Kernelheaping.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{sim.Kernelheaping}
\alias{sim.Kernelheaping}
\title{Simulation of heaping correction method}
\usage{
sim.Kernelheaping(
  simRuns,
  n,
  distribution,
  rounds,
  thresholds,
  downbias = 0.5,
  setBias = FALSE,
  Beta = 0,
  unequal = FALSE,
  burnin = 5,
  samples = 10,
  bw = "nrd0",
  offset = 0,
  boundary = FALSE,
  adjust = 1,
  ...
)
}
\arguments{
\item{simRuns}{number of simulation runs}
\item{n}{sample size}
\item{distribution}{name of the distribution where random sampling is available, e.g. "norm"}
\item{rounds}{rounding values, numeric vector of length >=1}
\item{thresholds}{rounding thresholds}
\item{downbias}{Bias parameter used in the simulation}
\item{setBias}{if TRUE a rounding Bias parameter is estimated.
For values above 0.5, the respondents are more prone to round down,
while for values < 0.5 they are more likely to round up}
\item{Beta}{Parameter of the probit model for rounding probabilities used in simulation}
\item{unequal}{if TRUE a probit model is fitted for the rounding probabilities with log(true value) as regressor}
\item{burnin}{burn-in sample size}
\item{samples}{sampling iteration size}
\item{bw}{bandwidth selector method, defaults to "nrd0", see \code{density} for more options}
\item{offset}{location shift parameter used in simulation}
\item{boundary}{TRUE for positive only data (no positive density for negative values)}
\item{adjust}{as in \code{density}, the user can multiply the bandwidth by a certain factor such that bw=adjust*bw}
\item{...}{additional attributes handed over to \code{createSim.Kernelheaping}}
}
\value{
List of estimation results
}
\description{
Simulation of heaping correction method
}
\examples{
\dontrun{Sims1 <- sim.Kernelheaping(simRuns=2, n=500, distribution="norm",
rounds=c(1,10,100), thresholds=c(0.3,0.4,0.3), sd=100)}
}

Kernelheaping/man/Kernelheaping.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/Kernelheaping.R
\docType{package}
\name{Kernelheaping}
\alias{Kernelheaping}
\title{Kernel Density Estimation for Heaped Data}
\description{
In self-reported or anonymized data the user often encounters heaped data,
i.e. data which are rounded (to a possibly different degree of coarseness).
While this is mostly a minor problem in parametric density estimation, the
bias can be very large for non-parametric methods such as kernel density
estimation. This package implements a partly Bayesian algorithm treating the
true unknown values as additional parameters and estimates the rounding
parameters to give a corrected kernel density estimate. It supports various
standard bandwidth selection methods. Varying rounding probabilities
(depending on the true value) and asymmetric rounding is estimable as well.
Additionally, bivariate non-parametric density estimation for rounded data
is supported.
}
\details{
The most important function is \code{\link{dheaping}}. See the help and the
attached examples on how to use the package.
}
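The workflow the package description sketches is short in practice. The following snippet is a minimal usage sketch mirroring the examples shipped in man/dheaping.Rd (the small burn-in and sample sizes are only for illustration):

library(Kernelheaping)
xtrue <- rnorm(3000)              # latent true values
xrounded <- round(xtrue)          # observed, heaped values
est <- dheaping(xrounded, rounds = 1, burnin = 20, samples = 50)
plot(est, trueX = xtrue)          # naive vs. corrected vs. oracle estimate
summary(est)                      # posterior summaries of the rounding parameters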
Kernelheaping/man/dshapebivr.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{dshapebivr}
\alias{dshapebivr}
\title{Bivariate Kernel density estimation for data classified in polygons or shapes}
\usage{
dshapebivr(
  data,
  burnin = 2,
  samples = 5,
  adaptive = FALSE,
  shapefile,
  gridsize = 200,
  boundary = FALSE,
  deleteShapes = NULL,
  fastWeights = TRUE,
  numChains = 1,
  numThreads = 1
)
}
\arguments{
\item{data}{data.frame with 3 columns: x-coordinate, y-coordinate (i.e.
center of polygon) and number of observations in area.}
\item{burnin}{burn-in sample size}
\item{samples}{sampling iteration size}
\item{adaptive}{TRUE for adaptive kernel density estimation}
\item{shapefile}{shapefile with number of polygons equal to nrow(data)}
\item{gridsize}{number of evaluation grid points}
\item{boundary}{boundary corrected kernel density estimate?}
\item{deleteShapes}{shapefile containing areas without observations}
\item{fastWeights}{if TRUE weights for boundary estimation are only computed
for the first 10 percent of samples to speed up computation}
\item{numChains}{number of chains of SEM algorithm}
\item{numThreads}{number of threads to be used (only applicable if more than one chain)}
}
\value{
The function returns a list object with the following objects (besides all input objects):
\item{\code{Mestimates}}{kde object containing the corrected density estimate}
\item{\code{gridx}}{Vector Grid of x-coordinates on which density is evaluated}
\item{\code{gridy}}{Vector Grid of y-coordinates on which density is evaluated}
\item{\code{resultDensity}}{Matrix with Estimated Density for each iteration}
\item{\code{resultX}}{Matrix of true latent values X estimates}
}
\description{
Bivariate Kernel density estimation for data classified in polygons or shapes
}
\examples{
\dontrun{
library(maptools)
# Read Shapefile of Berlin Urban Planning Areas (download available from:
# https://www.statistik-berlin-brandenburg.de/opendata/RBS_OD_LOR_2015_12.zip)
Berlin <- rgdal::readOGR("X:/SomeDir/RBS_OD_LOR_2015_12.shp") #(von daten.berlin.de)
# Get Dataset of Berlin Population (download available from:
# https://www.statistik-berlin-brandenburg.de/opendata/EWR201512E_Matrix.csv)
data <- read.csv2("X:/SomeDir/EWR201512E_Matrix.csv")
# Form Dataset for Estimation Process
dataIn <- cbind(t(sapply(1:length(Berlin@polygons),
                         function(x) Berlin@polygons[[x]]@labpt)), data$E_E65U80)
# Estimate Bivariate Density
Est <- dshapebivr(data = dataIn, burnin = 5, samples = 10, adaptive = FALSE,
                  shapefile = Berlin, gridsize = 325, boundary = TRUE)}
# Plot Density over Area:
\dontrun{breaks <- seq(1E-16,max(Est$Mestimates$estimate),length.out = 20)
image.plot(x=Est$Mestimates$eval.points[[1]],y=Est$Mestimates$eval.points[[2]],
           z=Est$Mestimates$estimate, asp=1, breaks = breaks,
           col = colorRampPalette(brewer.pal(9,"YlOrRd"))(length(breaks)-1))
plot(Berlin, add=TRUE)}
}

Kernelheaping/man/toOtherShape.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{toOtherShape}
\alias{toOtherShape}
\title{Transfer observations to other shape}
\usage{
toOtherShape(Mestimates, shapefile)
}
\arguments{
\item{Mestimates}{Estimation object created by functions dshapebivr and dbivr}
\item{shapefile}{The new shapefile to which the observations shall be transferred}
}
\value{
The function returns the count, sd and 90\%-coverage interval
of the observations in the different shapefile.
}
\description{
Transfer observations to other shape
}

Kernelheaping/man/tracePlots.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{tracePlots}
\alias{tracePlots}
\title{Plots some trace plots for the rounding, bias and acceleration (beta) parameters}
\usage{
tracePlots(x, ...)
}
\arguments{
\item{x}{Kernelheaping object produced by \code{dheaping} function}
\item{...}{additional arguments given to standard plot function}
}
\value{
Trace plots for the estimated parameters
}
\description{
Plots some trace plots for the rounding, bias and acceleration (beta) parameters
}

Kernelheaping/man/dclass.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{dclass}
\alias{dclass}
\title{Kernel density estimation for classified data}
\usage{
dclass(
  xclass,
  burnin = 2,
  samples = 5,
  boundary = FALSE,
  bw = "nrd0",
  evalpoints = 200,
  adjust = 1,
  dFunc = NULL
)
}
\arguments{
\item{xclass}{classified values; matrix with two columns: lower and upper value}
\item{burnin}{burn-in sample size}
\item{samples}{sampling iteration size}
\item{boundary}{TRUE for positive only data (no positive density for negative values)}
\item{bw}{bandwidth selector method, defaults to "nrd0", see \code{density} for more options}
\item{evalpoints}{number of evaluation grid points}
\item{adjust}{as in \code{density}, the user can multiply the bandwidth by a certain factor such that bw=adjust*bw}
\item{dFunc}{optional character name of a density function (with "d", "p" and "q" functions available) for parametric estimation, such as "norm", "gamma" or "lnorm"}
}
\value{
The function returns a list object with the following objects (besides all input objects):
\item{\code{Mestimates}}{kde object containing the corrected density estimate}
\item{\code{gridx}}{Vector Grid on which density is evaluated}
\item{\code{resultDensity}}{Matrix with Estimated Density for each iteration}
\item{\code{resultX}}{Matrix of true latent values X estimates}
}
\description{
Kernel density estimation for classified data
}
\examples{
x=rlnorm(500, meanlog = 8, sdlog = 1)
classes <- c(0,500,1000,1500,2000,2500,3000,4000,5000,6000,8000,10000,15000,Inf)
xclass <- cut(x,breaks=classes)
xclass <- cbind(classes[as.numeric(xclass)], classes[as.numeric(xclass) + 1])
densityEst <- dclass(xclass=xclass, burnin=20, samples=50, evalpoints=1000)
plot(densityEst$Mestimates~densityEst$gridx ,lwd=2, type = "l")
}

Kernelheaping/man/students.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/datasets.R
\docType{data}
\name{students}
\alias{students}
\title{Student0405}
\source{
http://mathfaculty.fullerton.edu/mori/Math120/Data/readme%20datasets/readme.html#Student0405
}
\description{
Data collected during 2004 and 2005 from students in statistics classes at a
large state university in the northeastern United States.
}
\references{
Utts, J. M., & Heckard, R. F. (2011). Mind on statistics. Cengage Learning.
}
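The students data documented above are the package's real-data showcase: self-reported weekly study hours cluster strongly at "round" values. A quick way to see the heaping (a sketch; StudyHrs is the column used in the dheaping examples):

data(students, package = "Kernelheaping")
hrs <- as.numeric(na.omit(students$StudyHrs))
mean(hrs %% 5 == 0)   # share of answers on multiples of 5, typically far above chance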
"norm"} \item{rounds}{rounding values} \item{thresholds}{rounding thresholds (for Beta=0)} \item{offset}{certain value added to all observed random samples} \item{downbias}{bias parameter} \item{Beta}{acceleration paramter} \item{...}{additional attributes handed over to "rdistribution" (i.e. rnorm, rgamma,..)} } \value{ List of heaped values, true values and input parameters } \description{ Create heaped data for Simulation } Kernelheaping/man/dheaping.Rd0000644000176200001440000000743314135254045015661 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/functions.R \name{dheaping} \alias{dheaping} \title{Kernel density estimation for heaped data} \usage{ dheaping( xheaped, rounds, burnin = 5, samples = 10, setBias = FALSE, weights = NULL, bw = "nrd0", boundary = FALSE, unequal = FALSE, random = FALSE, adjust = 1, recall = F, recallParams = c(1/3, 1/3) ) } \arguments{ \item{xheaped}{heaped values from which to estimate density of x} \item{rounds}{rounding values, numeric vector of length >=1} \item{burnin}{burn-in sample size} \item{samples}{sampling iteration size} \item{setBias}{if TRUE a rounding Bias parameter is estimated. For values above 0.5, the respondents are more prone to round down, while for values < 0.5 they are more likely to round up} \item{weights}{optional numeric vector of sampling weights} \item{bw}{bandwidth selector method, defaults to "nrd0" see \code{density} for more options} \item{boundary}{TRUE for positive only data (no positive density for negative values)} \item{unequal}{if TRUE a probit model is fitted for the rounding probabilities with log(true value) as regressor} \item{random}{if TRUE a random effect probit model is fitted for rounding probabilities} \item{adjust}{as in \code{density}, the user can multiply the bandwidth by a certain factor such that bw=adjust*bw} \item{recall}{if TRUE a recall error is introduced to the heaping model} \item{recallParams}{recall error model parameters expression(nu) and expression(eta). 
Default is c(1/3, 1/3)}
}
\value{
The function returns a list object with the following objects (besides all input objects):
\item{\code{meanPostDensity}}{Vector of Mean Posterior Density}
\item{\code{gridx}}{Vector Grid on which density is evaluated}
\item{\code{resultDensity}}{Matrix with Estimated Density for each iteration}
\item{\code{resultRR}}{Matrix with rounding probability threshold values for each iteration (on probit scale)}
\item{\code{resultBias}}{Vector with estimated Bias parameter for each iteration}
\item{\code{resultBeta}}{Vector with estimated Beta parameter for each iteration}
\item{\code{resultX}}{Matrix of true latent values X estimates}
}
\description{
Kernel density estimation for heaped data
}
\examples{
#Simple Rounding ----------------------------------------------------------
xtrue=rnorm(3000)
xrounded=round(xtrue)
est <- dheaping(xrounded,rounds=1,burnin=20,samples=50)
plot(est,trueX=xtrue)

#####################
#####Heaping
#####################
#Real Data Example ----------------------------------------------------------
# Student learning hours per week
data(students)
xheaped <- as.numeric(na.omit(students$StudyHrs))
\dontrun{est <- dheaping(xheaped,rounds=c(1,2,5,10), boundary=TRUE, unequal=TRUE,burnin=20,samples=50)
plot(est)
summary(est)}

#Simulate Data ----------------------------------------------------------
Sim1 <- createSim.Kernelheaping(n=500, distribution="norm",rounds=c(1,10,100),
                                thresholds=c(-0.5244005, 0.5244005), sd=100)
\dontrun{est <- dheaping(Sim1$xheaped,rounds=Sim1$rounds)
plot(est,trueX=Sim1$x)}

#Biased rounding
Sim2 <- createSim.Kernelheaping(n=500, distribution="gamma",rounds=c(1,2,5,10),
                                thresholds=c(-1.2815516, -0.6744898, 0.3853205),downbias=0.2,
                                shape=4,scale=8,offset=45)
\dontrun{est <- dheaping(Sim2$xheaped, rounds=Sim2$rounds, setBias=T, bw="SJ")
plot(est, trueX=Sim2$x)
summary(est)
tracePlots(est)}

Sim3 <- createSim.Kernelheaping(n=500, distribution="gamma",rounds=c(1,2,5,10),
                                thresholds=c(1.84, 2.64, 3.05),
                                downbias=0.75, Beta=-0.5, shape=4, scale=8)
\dontrun{est <- dheaping(Sim3$xheaped,rounds=Sim3$rounds,boundary=TRUE,unequal=TRUE,setBias=T)
plot(est,trueX=Sim3$x)}
}

Kernelheaping/man/summary.Kernelheaping.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{summary.Kernelheaping}
\alias{summary.Kernelheaping}
\title{Prints some descriptive statistics (means and quantiles) for the estimated rounding, bias and acceleration (beta) parameters}
\usage{
\method{summary}{Kernelheaping}(object, ...)
}
\arguments{
\item{object}{Kernelheaping object produced by \code{dheaping} function}
\item{...}{unused}
}
\value{
Prints summary statistics
}
\description{
Prints some descriptive statistics (means and quantiles) for the estimated rounding, bias and acceleration (beta) parameters
}

Kernelheaping/man/dshape3dProp.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{dshape3dProp}
\alias{dshape3dProp}
\title{3d Kernel density estimation for data classified in polygons or shapes}
\usage{
dshape3dProp(
  data,
  burnin = 2,
  samples = 5,
  shapefile,
  gridsize = 200,
  boundary = FALSE,
  deleteShapes = NULL,
  fastWeights = TRUE,
  numChains = 1,
  numThreads = 1
)
}
\arguments{
\item{data}{data.frame with 5 columns: x-coordinate, y-coordinate (i.e.
center of polygon) and number of observations in area for partial population
and number of observations for complete observations and third variable (numeric).}
\item{burnin}{burn-in sample size}
\item{samples}{sampling iteration size}
\item{shapefile}{shapefile with number of polygons equal to nrow(data) / length(unique(data[,5]))}
\item{gridsize}{number of evaluation grid points}
\item{boundary}{boundary corrected kernel density estimate?}
\item{deleteShapes}{shapefile containing areas without observations}
\item{fastWeights}{if TRUE weights for boundary estimation are only computed
for the first 10 percent of samples to speed up computation}
\item{numChains}{number of chains of SEM algorithm}
\item{numThreads}{number of threads to be used (only applicable if more than one chain)}
}
\description{
3d Kernel density estimation for data classified in polygons or shapes
}

Kernelheaping/man/dbivr.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{dbivr}
\alias{dbivr}
\title{Bivariate kernel density estimation for rounded data}
\usage{
dbivr(
  xrounded,
  roundvalue,
  burnin = 2,
  samples = 5,
  adaptive = FALSE,
  gridsize = 200
)
}
\arguments{
\item{xrounded}{rounded values from which to estimate bivariate density, matrix with 2 columns (x,y)}
\item{roundvalue}{rounding value (side length of the square in which the true value lies around the rounded one)}
\item{burnin}{burn-in sample size}
\item{samples}{sampling iteration size}
\item{adaptive}{set to TRUE for adaptive bandwidth}
\item{gridsize}{number of evaluation grid points}
}
\value{
The function returns a list object with the following objects (besides all input objects):
\item{\code{Mestimates}}{kde object containing the corrected density estimate}
\item{\code{gridx}}{Vector Grid on which density is evaluated (x)}
\item{\code{gridy}}{Vector Grid on which density is evaluated (y)}
\item{\code{resultDensity}}{Array with Estimated Density for each iteration}
\item{\code{resultX}}{Matrix of true latent values X estimates}
\item{\code{delaigle}}{Matrix of Delaigle estimator estimates}
}
\description{
Bivariate kernel density estimation for rounded data
}
\examples{
# Create Mu and Sigma -----------------------------------------------------------
mu1 <- c(0, 0)
mu2 <- c(5, 3)
mu3 <- c(-4, 1)
Sigma1 <- matrix(c(4, 3, 3, 4), 2, 2)
Sigma2 <- matrix(c(3, 0.5, 0.5, 1), 2, 2)
Sigma3 <- matrix(c(5, 4, 4, 6), 2, 2)
# Mixed Normal Distribution -------------------------------------------------------
mus <- rbind(mu1, mu2, mu3)
Sigmas <- rbind(Sigma1, Sigma2, Sigma3)
props <- c(1/3, 1/3, 1/3)
\dontrun{xtrue=rmvnorm.mixt(n=1000, mus=mus, Sigmas=Sigmas, props=props)
roundvalue=2
xrounded=plyr::round_any(xtrue,roundvalue)
est <- dbivr(xrounded,roundvalue=roundvalue,burnin=5,samples=10)

#Plot corrected and Naive distribution
plot(est,trueX=xtrue)
#for comparison: plot true density
dens=dmvnorm.mixt(x=expand.grid(est$Mestimates$eval.points[[1]],est$Mestimates$eval.points[[2]]),
                  mus=mus, Sigmas=Sigmas, props=props)
dens=matrix(dens,nrow=length(est$gridx),ncol=length(est$gridy))
contour(dens,x=est$Mestimates$eval.points[[1]],y=est$Mestimates$eval.points[[2]],
        xlim=c(min(est$gridx),max(est$gridx)),ylim=c(min(est$gridy),max(est$gridy)),main="True Density")}
}

Kernelheaping/man/dshapebivrProp.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{dshapebivrProp}
\alias{dshapebivrProp}
\title{Bivariate Kernel density estimation for data classified in polygons or shapes}
\usage{
dshapebivrProp(
  data,
  burnin = 2,
  samples = 5,
  adaptive = FALSE,
  shapefile,
  gridsize = 200,
  boundary = FALSE,
  deleteShapes = NULL,
  fastWeights = TRUE,
  numChains = 1,
  numThreads = 1
)
}
\arguments{
\item{data}{data.frame with 4 columns: x-coordinate, y-coordinate (i.e. center of polygon)
and number of observations in area for partial population and number of observations
for complete observations.}
\item{burnin}{burn-in sample size}
\item{samples}{sampling iteration size}
\item{adaptive}{TRUE for adaptive kernel density estimation}
\item{shapefile}{shapefile with number of polygons equal to nrow(data)}
\item{gridsize}{number of evaluation grid points}
\item{boundary}{boundary corrected kernel density estimate?}
\item{deleteShapes}{shapefile containing areas without observations}
\item{fastWeights}{if TRUE weights for boundary estimation are only computed
for the first 10 percent of samples to speed up computation}
\item{numChains}{number of chains of SEM algorithm}
\item{numThreads}{number of threads to be used (only applicable if more than one chain)}
}
\description{
Bivariate Kernel density estimation for data classified in polygons or shapes
}
\examples{
\dontrun{
library(maptools)
# Read Shapefile of Berlin Urban Planning Areas (download available from:
# https://www.statistik-berlin-brandenburg.de/opendata/RBS_OD_LOR_2015_12.zip)
Berlin <- rgdal::readOGR("X:/SomeDir/RBS_OD_LOR_2015_12.shp") #(von daten.berlin.de)
# Get Dataset of Berlin Population (download available from:
# https://www.statistik-berlin-brandenburg.de/opendata/EWR201512E_Matrix.csv)
data <- read.csv2("X:/SomeDir/EWR201512E_Matrix.csv")
# Form Dataset for Estimation Process
dataIn <- cbind(t(sapply(1:length(Berlin@polygons),
                         function(x) Berlin@polygons[[x]]@labpt)), data$E_E65U80, data$E_E)
#Estimate Bivariate Proportions (may take some minutes)
PropEst <- dshapebivrProp(data = dataIn, burnin = 5, samples = 20, adaptive = FALSE,
                          shapefile = Berlin, gridsize=325, numChains = 16, numThreads = 4)}
# Plot Proportions over Area:
\dontrun{
breaks <- seq(0,0.4,by=0.025)
image.plot(x=PropEst$Mestimates$eval.points[[1]],y=PropEst$Mestimates$eval.points[[2]],
           z=PropEst$proportion+1E-96, asp=1, breaks = breaks,
           col = colorRampPalette(brewer.pal(9,"YlOrRd"))(length(breaks)-1))
plot(Berlin, add=TRUE)}
}

Kernelheaping/man/plot.bivrounding.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{plot.bivrounding}
\alias{plot.bivrounding}
\title{Plot Kernel density estimate of heaped data naively and corrected by partly Bayesian model}
\usage{
\method{plot}{bivrounding}(x, trueX = NULL, ...)
}
\arguments{
\item{x}{bivrounding object produced by \code{dbivr} function}
\item{trueX}{optional, if true values X are known (in simulations, for example) the 'Oracle' density estimate is added as well}
\item{...}{additional arguments given to standard plot function}
}
\value{
plot with Kernel density estimates (Naive, Corrected and True (if provided))
}
\description{
Plot Kernel density estimate of heaped data naively and corrected by partly Bayesian model
}

Kernelheaping/man/simSummary.Kernelheaping.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{simSummary.Kernelheaping}
\alias{simSummary.Kernelheaping}
\title{Simulation Summary}
\usage{
simSummary.Kernelheaping(sim, coverage = 0.9)
}
\arguments{
\item{sim}{Simulation object returned from sim.Kernelheaping}
\item{coverage}{probability for computing coverage intervals}
}
\value{
list with summary statistics
}
\description{
Simulation Summary
}

Kernelheaping/man/plot.Kernelheaping.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/functions.R
\name{plot.Kernelheaping}
\alias{plot.Kernelheaping}
\title{Plot Kernel density estimate of heaped data naively and corrected by partly Bayesian model}
\usage{
\method{plot}{Kernelheaping}(x, trueX = NULL, ...)
}
\arguments{
\item{x}{Kernelheaping object produced by \code{dheaping} function}
\item{trueX}{optional, if true values X are known (in simulations, for example) the 'Oracle' density estimate is added as well}
\item{...}{additional arguments given to standard plot function}
}
\value{
plot with Kernel density estimates (Naive, Corrected and True (if provided))
}
\description{
Plot Kernel density estimate of heaped data naively and corrected by partly Bayesian model
}

Kernelheaping/DESCRIPTION

Package: Kernelheaping
Type: Package
Title: Kernel Density Estimation for Heaped and Rounded Data
Authors@R: c(person("Marcus", "Gross", role = c("aut", "cre"), email = "marcus.gross@inwt-statistics.de"),
             person("Lukas", "Fuchs", role = "aut", email = "f.lukas@live.de"),
             person("Kerstin", "Erfurth", role = "ctb", email = "keri.erfurth@gmail.com"))
Version: 2.3.0
Date: 2022-01-26
Depends: R (>= 2.15.0), MASS, ks, sparr
Imports: sp, plyr, dplyr, fastmatch, fitdistrplus, GB2, magrittr, mvtnorm
Author: Marcus Gross [aut, cre],
  Lukas Fuchs [aut],
  Kerstin Erfurth [ctb]
Maintainer: Marcus Gross <marcus.gross@inwt-statistics.de>
Description: In self-reported or anonymised data the user often encounters
        heaped data, i.e. data which are rounded (to a possibly different
        degree of coarseness). While this is mostly a minor problem in
        parametric density estimation the bias can be very large for
        non-parametric methods such as kernel density estimation. This package
        implements a partly Bayesian algorithm treating the true unknown
        values as additional parameters and estimates the rounding parameters
        to give a corrected kernel density estimate. It supports various
        standard bandwidth selection methods. Varying rounding probabilities
        (depending on the true value) and asymmetric rounding is estimable as
        well: Gross, M. and Rendtel, U. (2016) (). Additionally, bivariate
        non-parametric density estimation for rounded data, Gross, M. et al.
        (2016) (), as well as data aggregated on areas is supported.
License: GPL-2 | GPL-3
RoxygenNote: 7.1.0
NeedsCompilation: no
Packaged: 2022-01-26 18:23:59 UTC; M
Repository: CRAN
Date/Publication: 2022-01-26 18:42:52 UTC
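The bias the Description refers to is easy to reproduce before reading the source. The following sketch uses only base R (no package functions) to show how a naive kernel density estimate degrades once data are coarsely rounded:

set.seed(1)
x <- rnorm(1000)                      # true values
xr <- round(x / 2) * 2                # rounded to multiples of 2
plot(density(xr), main = "Naive KDE on rounded data")  # spiky, biased estimate
lines(density(x), lty = 2)            # estimate from the unrounded data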
Kernelheaping/R/functions.R

#' @importFrom grDevices rainbow
#' @importFrom graphics box contour image legend lines par plot
#' @importFrom stats aggregate density dnorm optim pnorm qnorm quantile rgamma runif rnorm sd
#' @importFrom GB2 mlfit.gb2
#' @importFrom fitdistrplus fitdist
#' @importFrom sp point.in.polygon
#' @importFrom dplyr group_by_all count distinct
#' @importFrom plyr count round_any
#' @importFrom magrittr "%>%"
#' @importFrom mvtnorm dmvnorm
#' @importFrom parallel clusterExport makeCluster stopCluster parLapply detectCores clusterEvalQ
#' @importFrom fastmatch fmatch

# create package environment for logging output
kernelheapingEnv <- new.env()
assign("logLevelInfo"  , TRUE , envir=kernelheapingEnv)
assign("logLevelDump"  , FALSE, envir=kernelheapingEnv)
assign("logLevelTiming", FALSE, envir=kernelheapingEnv)
assign("outputLevel"   , 1    , envir=kernelheapingEnv)

Rprx=function(rounds, Bias, RR, beta, gridx, ranefvalues, roundvalues){
  rprobs=rprobsRR2(RR,beta,gridx,ranefvalues)
  postMatrix=matrix(0,ncol=ncol(rprobs),nrow=2*nrow(rprobs))
  for(i in 1:length(rounds)){
    modulo=gridx%%rounds[i]
    lower=which(modulo < 0.5*rounds[i])
    higher=which(modulo > 0.5*rounds[i])
    postMatrix[i,lower]=Bias*rprobs[i,lower]
    postMatrix[i+length(rounds),higher]=(1-Bias)*rprobs[i,higher]
  }
  if(min(roundvalues)==0){postMatrix[c(1,1+length(rounds)),]=sum(postMatrix[c(1,1+length(rounds)),])/(2*length(gridx))}
  postMatrix=sweep(postMatrix,2,colSums(postMatrix),`/`, check.margin = FALSE) #\pi(R_i|X_i,\tau,a,\beta)
  return(postMatrix)
}

Rprx2=function(postMatrix, rounds, gridx){
  possW=seq(from=plyr::round_any(min(gridx),min(rounds)),to=round_any(max(gridx),min(rounds)),by=min(rounds))
  numbers=matrix(0,nrow=length(rounds)*2,ncol=length(possW),dimnames=list(NULL,possW))
  for(i in 1:length(c(rounds,rounds))){
    atemp=factor(plyr::round_any(gridx,accuracy=c(rounds,rounds)[i],f=round),levels=possW)
    numbers[i,]=tapply(postMatrix[i,],INDEX=list(atemp),FUN=function(x) sum(x,na.rm=T))
  }
  numbers[is.na(numbers)]=0
  sweep(numbers,2,colSums(numbers),`/`, check.margin = FALSE) # int \pi(R_i|W_i) dX_i
}

dbc <- function(gridx, x, bw, boundary=c(0, +Inf)){
  kernels <- sapply(x, function(y) {
    dens <- dnorm(gridx,mean = y,sd = bw)
    # zero out mass outside the boundary, then renormalise each kernel
    dens[gridx < boundary[1]] <- 0
    dens[gridx > boundary[2]] <- 0
    dens
  })
  kernels <- sweep(kernels,2,colSums(kernels),`/`)
  rowMeans(kernels)/(gridx[2]-gridx[1])
}

rprobsRR2=function(RR,beta,gridx,ranefvalues=0){
  pgivenX=matrix(pnorm(rep(beta*log(abs(gridx))+ranefvalues,length(RR))+rep(RR,each=length(gridx))),
                 nrow=length(RR),byrow=T)
  diff(rbind(0,pgivenX,1))
}

logLik=function(par, new, rguesstemp, unequal, setBias, rounds, ranefvalues, roundvalues){
  RR=sort(par[1:(length(rounds)-1)])
  Bias=0
  beta=0
  if(setBias==TRUE) {Bias=par[length(rounds)]}
  if(unequal==TRUE) {beta=par[length(par)]}
  pgivenX=Rprx(rounds=rounds,RR=RR, Bias=pnorm(Bias), beta=beta, gridx=unique(new),
               ranefvalues=ranefvalues, roundvalues=roundvalues)+1E-96
  probs <- log(pgivenX[ cumsum(rep(nrow(pgivenX),length.out=length(new)))-cumsum(((sequence(rle(new)$length)>1)*nrow(pgivenX)))-
                          nrow(pgivenX)+rguesstemp])
  return(-sum(probs))
}
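# The helpers above encode the rounding model: rprobsRR2 converts probit-scale
# thresholds RR into, for each grid point, one probability per rounding value,
# and Rprx/Rprx2 turn those into posteriors of the rounding value given the
# true and the observed value. A small sketch of what rprobsRR2 returns,
# wrapped in if (FALSE) so it does not run at source time (numbers illustrative):
if (FALSE) {
  RR <- qnorm(c(0.3, 0.7))       # two probit thresholds for three rounding values
  p  <- rprobsRR2(RR, beta = 0, gridx = 10)
  p                              # c(0.3, 0.4, 0.3): P(rounding value | true value = 10)
  colSums(p)                     # each column sums to 1
}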
logLikRandomGamma=function(ranefvalues, RR, Bias, beta, new, rguesstemp, rounds, tau, roundvalues){
  pgivenX=Rprx(rounds=rounds,RR=RR, Bias=pnorm(Bias),
               beta=beta, gridx=unique(new), ranefvalues=ranefvalues,roundvalues=roundvalues)
  return(sum(log(pgivenX[cumsum(rep(nrow(pgivenX),length.out=length(new)))-
                           cumsum(((sequence(rle(new)$length)>1)*nrow(pgivenX)))-
                           nrow(pgivenX)+rguesstemp]))+sum(log(dnorm(ranefvalues,mean=0,sd=sqrt(tau)))))
}

#' Kernel density estimation for heaped data
#' @param xheaped heaped values from which to estimate density of x
#' @param rounds rounding values, numeric vector of length >=1
#' @param burnin burn-in sample size
#' @param samples sampling iteration size
#' @param setBias if TRUE a rounding Bias parameter is estimated. For values above 0.5, the respondents
#' are more prone to round down, while for values < 0.5 they are more likely to round up
#' @param bw bandwidth selector method, defaults to "nrd0" see \code{density} for more options
#' @param boundary TRUE for positive only data (no positive density for negative values)
#' @param unequal if TRUE a probit model is fitted for the rounding probabilities with log(true value) as regressor
#' @param random if TRUE a random effect probit model is fitted for rounding probabilities
#' @param adjust as in \code{density}, the user can multiply the bandwidth by a certain factor such that bw=adjust*bw
#' @param weights optional numeric vector of sampling weights
#' @param recall if TRUE a recall error is introduced to the heaping model
#' @param recallParams recall error model parameters \eqn{\nu} and \eqn{\eta}. Default is c(1/3, 1/3)
#' @return
#' The function returns a list object with the following objects (besides all input objects):
#' \item{\code{meanPostDensity}}{Vector of Mean Posterior Density}
#' \item{\code{gridx}}{Vector Grid on which density is evaluated}
#' \item{\code{resultDensity}}{Matrix with Estimated Density for each iteration}
#' \item{\code{resultRR}}{Matrix with rounding probability threshold values for each iteration (on probit scale)}
#' \item{\code{resultBias}}{Vector with estimated Bias parameter for each iteration}
#' \item{\code{resultBeta}}{Vector with estimated Beta parameter for each iteration}
#' \item{\code{resultX}}{Matrix of true latent values X estimates}
#' @examples
#' #Simple Rounding ----------------------------------------------------------
#' xtrue=rnorm(3000)
#' xrounded=round(xtrue)
#' est <- dheaping(xrounded,rounds=1,burnin=20,samples=50)
#' plot(est,trueX=xtrue)
#'
#' #####################
#' #####Heaping
#' #####################
#' #Real Data Example ----------------------------------------------------------
#' # Student learning hours per week
#' data(students)
#' xheaped <- as.numeric(na.omit(students$StudyHrs))
#' \dontrun{est <- dheaping(xheaped,rounds=c(1,2,5,10), boundary=TRUE, unequal=TRUE,burnin=20,samples=50)
#' plot(est)
#' summary(est)}
#'
#' #Simulate Data ----------------------------------------------------------
#' Sim1 <- createSim.Kernelheaping(n=500, distribution="norm",rounds=c(1,10,100),
#'                      thresholds=c(-0.5244005, 0.5244005), sd=100)
#' \dontrun{est <- dheaping(Sim1$xheaped,rounds=Sim1$rounds)
#' plot(est,trueX=Sim1$x)}
#'
#' #Biased rounding
#' Sim2 <- createSim.Kernelheaping(n=500, distribution="gamma",rounds=c(1,2,5,10),
#'                      thresholds=c(-1.2815516, -0.6744898, 0.3853205),downbias=0.2,
#'                      shape=4,scale=8,offset=45)
#' \dontrun{est <- dheaping(Sim2$xheaped, rounds=Sim2$rounds, setBias=T, bw="SJ")
#' plot(est, trueX=Sim2$x)
#' summary(est)
#' tracePlots(est)}
#'
#' Sim3 <- createSim.Kernelheaping(n=500, distribution="gamma",rounds=c(1,2,5,10),
#'                      thresholds=c(1.84, 2.64, 3.05),
#'                      downbias=0.75, Beta=-0.5, shape=4, scale=8)
#' \dontrun{est <- dheaping(Sim3$xheaped,rounds=Sim3$rounds,boundary=TRUE,unequal=TRUE,setBias=T)
#' plot(est,trueX=Sim3$x)}
#' @export
dheaping <- function(xheaped, rounds, burnin=5, samples=10, setBias=FALSE, weights=NULL,
                     bw= "nrd0", boundary=FALSE, unequal=FALSE, random=FALSE,
                     adjust=1, recall=F, recallParams=c(1/3,1/3)){
  if(is.null(weights)){weights=rep(1/length(xheaped),length(xheaped))}
  weights=weights/sum(weights)
  roundvalues=rounds
  if(min(rounds)==0){rounds[rounds==0]=0.5*min(rounds[rounds>0])}
  # sort observations (and their weights) by value and drop missing values
  weights=weights[order(xheaped)]
  xheaped=xheaped[order(xheaped)]
  weights=weights[!is.na(xheaped)]
  xheaped=xheaped[!is.na(xheaped)]
  xheapedOriginal <- xheaped
  xheaped=plyr::round_any(xheaped,accuracy=min(rounds)) #round to lowest rounding value
  #Create grid
  stepsize=1/2^(which(diff(range(xheaped))/min(rounds)*2^(1:10)>100)[1]) #at least 100 grid points
  gridx=seq(min(xheaped)-max(rounds)*0.5+stepsize/2*min(rounds),
            max(xheaped)+max(rounds)*0.5-stepsize/2*min(rounds),
            by=min(rounds)*stepsize)
  if(boundary==TRUE|unequal==TRUE){gridx=gridx[gridx>0]}
  #Pilot Estimation
  if(boundary==FALSE){Mestimates <- density(xheaped,from=min(gridx),to=max(gridx),n=length(gridx),bw=max(rounds)*2,
                                            weights=weights)$y}
  if(boundary==TRUE){Mestimates <- dbc(gridx=gridx,x=xheaped,bw=max(rounds)*2)}
  #Starting values for x(new), Rounding values(rguesstemp), und Rounding thresholds(RR), Bias (Bias) and beta
  #rguesstemp<-sapply(1:length(xheaped),function(x) rounds[max(which(xheaped[x]%%rounds==0))])
  rguesstemp=rep(0,length(xheaped))
  heapedvalues=unique(xheaped)
  xtable<-table(xheaped)
  #Starting Values
  new<-xheaped
  RR<-qnorm(1:(length(rounds)-1)/(length(rounds)))
  Bias<-0
  beta<-0
  tau<-0.5
  ranefnames=unique(plyr::round_any(gridx,accuracy=min(rounds)))
  ranef=rnorm(length(ranefnames),sd=sqrt(tau))
  ranefvalues=rep(0,length(gridx))
  #Result matrices
  resultDensity=matrix(nrow=samples+burnin,ncol=length(gridx))
  resultX=matrix(nrow=samples+burnin,ncol=length(xheaped))
  resultRR=matrix(nrow=samples+burnin,ncol=length(rounds)-1)
  resultBias=matrix(nrow=samples+burnin,ncol=1)
  resultBeta=matrix(nrow=samples+burnin,ncol=1)
  resultRandom=matrix(nrow=samples+burnin,ncol=length(ranefnames),dimnames=list(1:(samples+burnin),ranefnames))
  resultTau=rep(0,times=samples+burnin)
  selectionGrid<-lapply(heapedvalues,function(k){
    selectionupper=lapply(1:length(rounds), function(x) which(gridx>k-rounds[x]*0.5&gridx<=k))
    selectionlower=lapply(1:length(rounds), function(x) which(gridx<k+rounds[x]*0.5&gridx>=k))
    selection=c(selectionlower,selectionupper)
  })
  for(j in 1:(burnin+samples)){
    Rprs1=Rprx(rounds=rounds,RR=RR, Bias=pnorm(Bias), beta=beta, gridx=gridx,
               ranefvalues=ranefvalues,roundvalues=roundvalues)
    Rprs2=Rprx2(Rprs1, rounds=rounds, gridx=gridx)[,as.character(heapedvalues)]
    if(recall==FALSE){
      getNew <- getNewX(heapedvalues, selectionGrid, Mestimates, Rprs1, Rprs2, xtable,
                        xheaped, gridx, xheapedOriginal, roundvalues, rguesstemp, new)
    }
    if(recall==TRUE){
      getNew <- getNewV(heapedvalues, selectionGrid, Mestimates, Rprs1, Rprs2, xtable,
                        xheaped, gridx, xheapedOriginal, roundvalues, rguesstemp, new,
                        Bias, beta, ranefvalues, RR, recallParams)
    }
    new=getNew[[1]]
    rguesstemp=getNew[[2]]
    #ordered
    newx=getNew[[1]][order(getNew[[1]])]
    rguesstempx=getNew[[2]][order(getNew[[1]])]
    #Laplace-Approximation:
    par=RR
    if(setBias==TRUE){par=c(par,Bias)}
    if(unequal==TRUE){par=c(par,beta)}
    if(length(rounds)>1){
      ranefvaluestemp=ranefvalues[match(unique(newx),gridx)]
      laplace=optim(par,logLik,new=newx,rguesstemp=rguesstempx,
                    unequal=unequal, setBias=setBias, ranefvalues=ranefvaluestemp, rounds=rounds,
                    hessian=T, roundvalues=roundvalues, method="BFGS")
      par=MASS::mvrnorm(1,mu=laplace$par,Sigma=solve(laplace$hessian+diag(1E-7,length(par))))
      # Metropolis-Hastings als Alternative
      # parNew=par+rnorm(length(par),sd=0.05)
      # alpha=min(1,exp(-logLik(par = parNew, new=new,rguesstemp=rguesstemp, unequal=unequal, setBias=setBias,
      #                         postMatrix=matrix(0,ncol=length(unique(new)),nrow=2*length(rounds),dimnames=list(NULL,unique(new[order(new)]))),
      #                         ranefvalues=ranefvaluestemp,rounds=rounds)+
      #                  logLik(par = par, new=new,rguesstemp=rguesstemp, unequal=unequal, setBias=setBias,
      #                         postMatrix=matrix(0,ncol=length(unique(new)),nrow=2*length(rounds),dimnames=list(NULL,unique(new[order(new)]))),
      #                         ranefvalues=ranefvaluestemp,rounds=rounds)))
      # if(runif(1)<alpha){par=parNew}
      # unpack the sampled parameter vector (reconstructed to mirror logLik;
      # the original lines were lost in extraction)
      RR=sort(par[1:(length(rounds)-1)])
      if(setBias==TRUE){Bias=par[length(rounds)]}
      if(unequal==TRUE){beta=par[length(par)]}
    }
    # (the update of the random-effect values ranef/ranefvalues and of tau that
    #  belongs here could not be recovered from the damaged source)
    if(min(roundvalues)==0){
      new[xheapedOriginal%%min(roundvalues[roundvalues>0])!=0]=xheapedOriginal[xheapedOriginal%%min(roundvalues[roundvalues>0])!=0]
    }
    if(recall==T){
      sdV <- recallParams[1]*c(rounds,rounds)[rguesstemp]+
        recallParams[2]*sapply(new,function(x) diff(c(0,pnorm(rep(beta*log(abs(x)),length(RR))+RR),1))%*%roundvalues)
      new=sapply(1:length(new), function(y) sample(x=gridx,size=1,
                                                   prob=dnorm(gridx,mean=new[y]+qnorm(pnorm(Bias))*sdV[y],
                                                              sd=sdV[y])*Mestimates))
    }
    h <- density(new,bw=bw)$bw*adjust
    if(boundary==TRUE){Mestimates <- dbc(gridx=gridx,x=new,bw=h)}
    if(boundary==FALSE){Mestimates <- density(new,from=min(gridx),to=max(gridx),n=length(gridx),bw=h,weights=weights)$y}
    #Save Results
    resultDensity[j,]=Mestimates
    resultRR[j,]=RR
    resultBias[j,]=pnorm(Bias)
    resultBeta[j,]=beta
    resultX[j,]=new
    resultTau[j]=tau
    resultRandom[j,]=ranef
    print(paste("Iteration:",j,"of", burnin+samples))
  }
  meanPostDensity=apply(resultDensity[-c(1:burnin),],2,mean)
  est<-list(meanPostDensity=meanPostDensity,resultDensity=resultDensity,resultRR=resultRR,
            resultBias=resultBias, resultBeta=resultBeta, resultX=resultX, xheaped=xheaped,
            gridx=gridx, boundary=boundary, rounds=roundvalues, setBias=setBias, unequal=unequal,
            bw=bw, burnin=burnin, samples=samples, adjust=adjust, resultTau=resultTau,
            resultRandom=resultRandom, random=random, weights=weights)
  class(est) <- "Kernelheaping"
  return(est)
}
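# Because dheaping is a stochastic (SEM-type) algorithm, the sampled parameters
# should have stabilised before meanPostDensity is trusted. A usage sketch
# based on the shipped examples (biased-rounding simulation, then trace plots):
if (FALSE) {
  Sim2 <- createSim.Kernelheaping(n = 500, distribution = "gamma", rounds = c(1, 2, 5, 10),
                                  thresholds = c(-1.2815516, -0.6744898, 0.3853205),
                                  downbias = 0.2, shape = 4, scale = 8, offset = 45)
  est <- dheaping(Sim2$xheaped, rounds = Sim2$rounds, setBias = TRUE)
  tracePlots(est)   # threshold and bias chains should look stationary after burn-in
}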
#' Plot Kernel density estimate of heaped data naively and corrected by partly Bayesian model
#' @param x Kernelheaping object produced by \code{dheaping} function
#' @param trueX optional, if true values X are known (in simulations, for example) the 'Oracle' density estimate is added as well
#' @param ... additional arguments given to standard plot function
#' @return plot with Kernel density estimates (Naive, Corrected and True (if provided))
#' @method plot Kernelheaping
#' @export
plot.Kernelheaping <- function(x,trueX=NULL, ...){
  if(x$boundary==FALSE){
    plot(density(x$xheaped,bw=x$bw,adjust=x$adjust, weights=x$weights),xlab="x",ylab="Density",main="", ...)
    if(!is.null(trueX)){lines(density(trueX,bw=x$bw,adjust=x$adjust, weights=x$weights),col="blue",lty=2,lwd=2)}
  }
  if(x$boundary==TRUE){
    plot(dbc(gridx=x$gridx,x=x$xheaped,bw=density(x$xheaped,bw=x$bw,adjust=x$adjust)$bw)~x$gridx,type="l",xlab="x",ylab="Density",
         main="", ...)
    if(!is.null(trueX)){lines(dbc(x$gridx, trueX, bw=density(trueX,bw=x$bw,adjust=x$adjust)$bw)~x$gridx,col="blue",lty=2,lwd=2)}
  }
  lines(x$meanPostDensity~x$gridx,col="red",lty=4,lwd=2)
  if(is.null(trueX)){legend("topright",c("Naive","Corrected"),col=c("black","red"),lty=c(1,4))}
  if(!is.null(trueX)){legend("topright",c("Naive","Corrected","Oracle"),col=c("black","red","blue"),lty=c(1,4,2),lwd=c(1,2,2))}
}

#' Prints some descriptive statistics (means and quantiles) for the estimated rounding, bias and acceleration (beta) parameters
#' @param object Kernelheaping object produced by \code{dheaping} function
#' @param ... unused
#' @return Prints summary statistics
#' @method summary Kernelheaping
#' @export
summary.Kernelheaping <- function(object, ...){
  Threshold=c(-Inf,colMeans(object$resultRR[-c(1:object$burnin),]))
  names(Threshold)=object$rounds
  cat("Mean Posterior Threshold Values:", "\n")
  cat(Threshold, "\n")
  if(object$setBias==TRUE){
    Bias=c(mean(object$resultBias[-c(1:object$burnin)]),quantile(object$resultBias[-c(1:object$burnin)],c(0.025,0.975)))
    names(Bias)[1]="Mean"
    cat("\n", "Bias, Mean and 95% credible interval:", "\n")
    cat(Bias, "\n")
  }
  if(object$unequal==TRUE){
    unequal=c(mean(object$resultBeta[-c(1:object$burnin)]),quantile(object$resultBeta[-c(1:object$burnin)],c(0.025,0.975)))
    names(unequal)[1]="Mean"
    cat("\n", "Beta, Mean and 95% credible interval:", "\n")
    cat(unequal, "\n")}
}

#' Plots some trace plots for the rounding, bias and acceleration (beta) parameters
#' @param x Kernelheaping object produced by \code{dheaping} function
#' @param ... additional arguments given to standard plot function
#' @return Trace plots for the estimated parameters
#' @export
tracePlots <- function(x, ...){
  par(mfrow=c(2,ceiling(ncol(x$resultRR)/2)))
  for(i in 1:ncol(x$resultRR)){
    plot(x$resultRR[,i],xlab="iteration", ylab="value", main=paste("Traceplot Threshold", i), type="l")
  }
  par(mfrow=c(1,1))
  if(x$setBias==TRUE){
    plot(x$resultBias,xlab="iteration", ylab="value", main="Traceplot Bias", type="l")
  }
  if(x$unequal==TRUE){
    plot(x$resultBeta,xlab="iteration", ylab="value", main="Traceplot Beta", type="l")
  }
  if(x$random==TRUE){
    plot(x$resultTau,xlab="iteration", ylab="value", main="Traceplot tau", type="l")
  }
}

#' Simulation of heaping correction method
#' @param simRuns number of simulation runs
#' @param n sample size
#' @param distribution name of the distribution where random sampling is available, e.g. "norm"
#' @param rounds rounding values, numeric vector of length >=1
#' @param thresholds rounding thresholds
#' @param burnin burn-in sample size
#' @param samples sampling iteration size
#' @param downbias Bias parameter used in the simulation
#' @param setBias if TRUE a rounding Bias parameter is estimated. For values above 0.5, the respondents
#' are more prone to round down, while for values < 0.5 they are more likely to round up
#' @param bw bandwidth selector method, defaults to "nrd0" see \code{density} for more options
#' @param offset location shift parameter used in simulation
#' @param boundary TRUE for positive only data (no positive density for negative values)
#' @param Beta Parameter of the probit model for rounding probabilities used in simulation
#' @param unequal if TRUE a probit model is fitted for the rounding probabilities with log(true value) as regressor
#' @param adjust as in \code{density}, the user can multiply the bandwidth by a certain factor such that bw=adjust*bw
#' @param ...
#'   additional attributes handed over to \code{createSim.Kernelheaping}
#' @return List of estimation results
#' @examples
#' \dontrun{Sims1 <- sim.Kernelheaping(simRuns=2, n=500, distribution="norm",
#' rounds=c(1,10,100), thresholds=c(0.3,0.4,0.3), sd=100)}
#' @export
sim.Kernelheaping <- function(simRuns, n, distribution, rounds, thresholds, downbias=0.5, setBias = FALSE,
                              Beta=0, unequal = FALSE, burnin = 5, samples = 10, bw = "nrd0",
                              offset=0, boundary = FALSE, adjust = 1, ...){
  lapply(1:simRuns, function(x){
    print(x)
    Sim <- createSim.Kernelheaping(n, distribution, rounds, thresholds, downbias=downbias, Beta=Beta, offset=offset, ...)
    est <- dheaping(Sim$xheaped, rounds=Sim$rounds, burnin = burnin, samples = samples, setBias = setBias,
                    bw = bw, boundary = boundary, unequal = unequal, adjust = adjust)
    if(est$boundary==T){
      naiveD  <- dbc(est$gridx, est$xheaped,
                     density(est$xheaped,from=min(est$gridx),to=max(est$gridx),n=length(est$gridx),bw=est$bw,adjust=est$adjust)$bw)
      oracleD <- dbc(est$gridx, Sim$x,
                     density(Sim$x,from=min(est$gridx),to=max(est$gridx),n=length(est$gridx),bw=est$bw,adjust=est$adjust)$bw)
    }
    if(est$boundary==F){
      naiveD  <- density(est$xheaped,from=min(est$gridx),to=max(est$gridx),n=length(est$gridx),bw=est$bw,adjust=est$adjust)$y
      oracleD <- density(Sim$x,from=min(est$gridx),to=max(est$gridx),n=length(est$gridx),bw=est$bw,adjust=est$adjust)$y
    }
    trueD <- do.call(paste("d",distribution,sep=""), list(est$grid-offset, ...))
    return(list(est=est,Sim=Sim,naiveD=naiveD,oracleD=oracleD,trueD=trueD))
  })
}

#' Simulation Summary
#' @param sim Simulation object returned from sim.Kernelheaping
#' @param coverage probability for computing coverage intervals
#' @return list with summary statistics
#' @export
simSummary.Kernelheaping <- function(sim, coverage=0.9){
  RMISE <- sqrt(sapply(sim,function(x) cbind(sum(diff(x$est$grid)[1]*(x$est$meanPostDensity-x$trueD)^2),
                                             sum(diff(x$est$grid)[1]*(x$naiveD-x$trueD)^2),
                                             sum(diff(x$est$grid)[1]*(x$oracleD-x$trueD)^2))))
  RRmeans <- sapply(sim,function(x) colMeans(as.matrix(x$est$resultRR[-c(1:x$est$burnin),],nrow=x$est$samples)))
  RRlower <- sapply(sim,function(x) apply(x$est$resultRR[-c(1:x$est$burnin),],2,quantile,0.5*(1-coverage)))
  RRupper <- sapply(sim,function(x) apply(x$est$resultRR[-c(1:x$est$burnin),],2,quantile,1-(0.5*(1-coverage))))
  Biasmean <- sapply(sim,function(x) mean(x$est$resultBias[-c(1:x$est$burnin),]))
  Biaslower <- sapply(sim,function(x) quantile(x$est$resultBias[-c(1:x$est$burnin),],0.5*(1-coverage)))
  Biasupper <- sapply(sim,function(x) quantile(x$est$resultBias[-c(1:x$est$burnin),],1-(0.5*(1-coverage))))
  Betamean <- sapply(sim,function(x) mean(x$est$resultBeta[-c(1:x$est$burnin),]))
  Betalower <- sapply(sim,function(x) quantile(x$est$resultBeta[-c(1:x$est$burnin),],0.5*(1-coverage)))
  Betaupper <- sapply(sim,function(x) quantile(x$est$resultBeta[-c(1:x$est$burnin),],1-(0.5*(1-coverage))))
  #coverage
  coverageRR <- rowSums(sapply(1:length(sim), function(x) sim[[x]]$Sim$thresholds > RRlower[,x] &
                                 sim[[x]]$Sim$thresholds < RRupper[,x]))/length(sim)
  coverageBias <- sum(sapply(1:length(sim), function(x) sim[[x]]$Sim$downbias > Biaslower[x] &
                               sim[[x]]$Sim$downbias < Biasupper[x]))/length(sim)
  coverageBeta <- sum(sapply(1:length(sim), function(x) sim[[x]]$Sim$Beta > Betalower[x] &
                               sim[[x]]$Sim$Beta < Betaupper[x]))/length(sim)
  # return value reconstructed from the quantities computed above; the original
  # line was lost in extraction
  return(list(RMISE=RMISE, RRmeans=RRmeans, RRlower=RRlower, RRupper=RRupper,
              Biasmean=Biasmean, Biaslower=Biaslower, Biasupper=Biasupper,
              Betamean=Betamean, Betalower=Betalower, Betaupper=Betaupper,
              coverageRR=coverageRR, coverageBias=coverageBias, coverageBeta=coverageBeta))
}

#' Create heaped data for Simulation
#' @param n sample size
#' @param distribution name of the distribution where random sampling is available, e.g. "norm"
#' @param rounds rounding values
#' @param thresholds rounding thresholds (for Beta=0)
#' @param offset certain value added to all observed random samples
#' @param downbias bias parameter
#' @param Beta acceleration parameter
#' @param ... additional attributes handed over to "rdistribution" (i.e. rnorm, rgamma, ...)
#' @return List of heaped values, true values and input parameters
#' @export
createSim.Kernelheaping <- function(n, distribution, rounds, thresholds, offset=0, downbias=0.5, Beta=0, ...){
  # the opening lines of this function (drawing xtrue from "rdistribution" plus
  # offset and setting up the probit thresholds RR0) were lost in extraction;
  # the two lines below are a reconstruction from the documented interface
  xtrue <- do.call(paste("r",distribution,sep=""), list(n, ...)) + offset
  RR0 <- thresholds
  down=lapply(xtrue,function(x) (x%%rounds < rounds*0.5)*downbias*rprobsRR2(RR=RR0,beta=Beta,gridx=x)+
                (x%%rounds >= rounds*0.5)*(1-downbias)*rprobsRR2(RR=RR0,beta=Beta,gridx=x))
  down=lapply(down, function(x) x/sum(x))
  roundings=sapply(down,function(z) sample(rounds,size=1,replace=TRUE,prob=z))
  xheaped=plyr::round_any(xtrue,accuracy=roundings, f=round)
  return(list(xheaped=xheaped, rounds=rounds, thresholds=thresholds, downbias=downbias, Beta=Beta, x=xtrue))
}
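# What createSim.Kernelheaping hands back is easiest to see on a small example,
# here a sketch reusing the parameter values of Sim1 from the dheaping examples:
if (FALSE) {
  Sim1 <- createSim.Kernelheaping(n = 500, distribution = "norm", rounds = c(1, 10, 100),
                                  thresholds = c(-0.5244005, 0.5244005), sd = 100)
  str(Sim1[c("x", "xheaped")])     # true values and their heaped counterparts
  table(Sim1$xheaped %% 10 == 0)   # how many observations landed on the coarser grids
}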
#' Bivariate kernel density estimation for rounded data
#' @param xrounded rounded values from which to estimate bivariate density, matrix with 2 columns (x,y)
#' @param roundvalue rounding value (side length of the square in which the true value lies around the rounded one)
#' @param burnin burn-in sample size
#' @param samples sampling iteration size
#' @param adaptive set to TRUE for adaptive bandwidth
#' @param gridsize number of evaluation grid points
#' @return
#' The function returns a list object with the following objects (besides all input objects):
#' \item{\code{Mestimates}}{kde object containing the corrected density estimate}
#' \item{\code{gridx}}{Vector Grid on which density is evaluated (x)}
#' \item{\code{gridy}}{Vector Grid on which density is evaluated (y)}
#' \item{\code{resultDensity}}{Array with Estimated Density for each iteration}
#' \item{\code{resultX}}{Matrix of true latent values X estimates}
#' \item{\code{delaigle}}{Matrix of Delaigle estimator estimates}
#' @examples
#' # Create Mu and Sigma -----------------------------------------------------------
#' mu1 <- c(0, 0)
#' mu2 <- c(5, 3)
#' mu3 <- c(-4, 1)
#' Sigma1 <- matrix(c(4, 3, 3, 4), 2, 2)
#' Sigma2 <- matrix(c(3, 0.5, 0.5, 1), 2, 2)
#' Sigma3 <- matrix(c(5, 4, 4, 6), 2, 2)
#' # Mixed Normal Distribution -------------------------------------------------------
#' mus <- rbind(mu1, mu2, mu3)
#' Sigmas <- rbind(Sigma1, Sigma2, Sigma3)
#' props <- c(1/3, 1/3, 1/3)
#' \dontrun{xtrue=rmvnorm.mixt(n=1000, mus=mus, Sigmas=Sigmas, props=props)
#' roundvalue=2
#' xrounded=plyr::round_any(xtrue,roundvalue)
#' est <- dbivr(xrounded,roundvalue=roundvalue,burnin=5,samples=10)
#'
#' #Plot corrected and Naive distribution
#' plot(est,trueX=xtrue)
#' #for comparison: plot true density
#' dens=dmvnorm.mixt(x=expand.grid(est$Mestimates$eval.points[[1]],est$Mestimates$eval.points[[2]]),
#'                   mus=mus, Sigmas=Sigmas, props=props)
#' dens=matrix(dens,nrow=length(est$gridx),ncol=length(est$gridy))
#' contour(dens,x=est$Mestimates$eval.points[[1]],y=est$Mestimates$eval.points[[2]],
#'         xlim=c(min(est$gridx),max(est$gridx)),ylim=c(min(est$gridy),max(est$gridy)),main="True Density")}
#' @export
dbivr <- function(xrounded, roundvalue, burnin=2, samples=5, adaptive=FALSE, gridsize=200){
  #Create grid - sparr package requires identical grid length on both axis:
  gridx=seq(min(xrounded[,1])-0.5*roundvalue,max(xrounded[,1])+0.5*roundvalue,length=gridsize)
  gridy=seq(min(xrounded[,2])-0.5*roundvalue,max(xrounded[,2])+0.5*roundvalue,length=gridsize)
  #Pilot Estimation
  Mestimates <- ks::kde(x=xrounded, H=diag(c(roundvalue,roundvalue))^2,gridsize=c(length(gridx),length(gridy)),
                        xmin=c(min(gridx),min(gridy)),xmax=c(max(gridx),max(gridy)))
  #Result matrices
  resultDensity=array(dim=c(burnin+samples,length(gridx),length(gridy)))
  resultX=array(dim=c(samples+burnin,nrow(xrounded),2))
  rvalues=unique(xrounded) #unique rounded values in data
  selectionGrid<-lapply(1:nrow(rvalues),function(k){
    selectionX=which(Mestimates$eval.points[[1]]>=rvalues[k,1]-roundvalue*0.5&Mestimates$eval.points[[1]]<rvalues[k,1]+roundvalue*0.5)
    selectionY=which(Mestimates$eval.points[[2]]>=rvalues[k,2]-roundvalue*0.5&Mestimates$eval.points[[2]]<rvalues[k,2]+roundvalue*0.5)
    # selectionY reconstructed by symmetry with selectionX; the remainder of
    # dbivr (its SEM sampling loop, the Delaigle estimator and the return
    # value) was lost in extraction
  })
}

getNewX <- function(heapedvalues, selectionGrid, Mestimates, Rprs1, Rprs2, xtable,
                    xheaped, gridx, xheapedOriginal, roundvalues, rguesstemp, new){
  # signature reconstructed from the call in dheaping; the body that draws the
  # latent values and rounding values was lost in extraction up to the special
  # handling of a zero rounding value below
  if(0 %in% roundvalues){
    rguesstemp[xheapedOriginal%%min(roundvalues[roundvalues>0])!=0] = sample(c(1,length(roundvalues)+1),
                                                                             length(rguesstemp[xheapedOriginal%%min(roundvalues[roundvalues>0])!=0]),replace=T)
    rguesstemp[xheapedOriginal%%min(roundvalues[roundvalues>0])==0&rguesstemp %in% c(1,length(roundvalues)+1)] <-
      rguesstemp[xheapedOriginal%%min(roundvalues[roundvalues>0])==0&rguesstemp %in% c(1,length(roundvalues)+1)]+1
  }
  return(list(new,rguesstemp))
}
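# getNewX and getNewV both re-draw, for every observed heaped value, a latent
# true value from the pilot density restricted to the interval that the assumed
# rounding value allows. A minimal sketch of that candidate set (hypothetical
# numbers, independent of the damaged body above; cf. selectionGrid in dheaping):
if (FALSE) {
  gridx <- seq(0, 20, by = 0.25)
  w <- 10; r <- 10                 # observed value and its rounding value
  candidates <- gridx[gridx > w - r/2 & gridx <= w + r/2]
  range(candidates)                # roughly (5, 15]: every value that rounds to w
}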
getNewV <- function(heapedvalues, selectionGrid, Mestimates, Rprs1, Rprs2, xtable, xheaped, gridx,
                    xheapedOriginal, roundvalues, rguesstemp, new, Bias, beta, ranefvalues, RR,
                    recallParams, rounds){
  for(i in 1:length(heapedvalues)){
    selection=selectionGrid[[i]]
    selectionprobs=lapply(1:length(selection),function(x) #f(X_i,R_i|.)
      Rprs1[x,selection[[x]]]/(sum(Rprs1[x,selection[[x]]]))*
        (Rprs2[x,i]))
    selectionprobs <- unlist(selectionprobs)
    selectionprobs[is.na(selectionprobs)] <- 1e-16
    temprounds=unlist(lapply(1:length(selection), function(x) rep(x,times=length(selection[[x]]))))
    sdV <- recallParams[1]*c(roundvalues,roundvalues)[temprounds]+
      recallParams[2]*sapply(unlist(selection),function(x)
        diff(c(0,pnorm(rep(beta*log(abs(gridx[x])),length(RR))+RR),1))%*%roundvalues)
    selectionprobs2=sapply(new[xheaped==heapedvalues[i]],function(x)
      dnorm(gridx[unlist(selection)],mean=x-qnorm(pnorm(Bias))*sdV, sd=sdV))
    temp=sapply(1:ncol(selectionprobs2), function(j)
      sample(1:length(selectionprobs),size=1,prob=selectionprobs*selectionprobs2[,j], replace=TRUE))
    rguesstemp[xheaped==heapedvalues[i]]=temprounds[temp]
    new[xheaped==heapedvalues[i]]=gridx[unlist(selection)[temp]]
  }
  if(0 %in% roundvalues){
    rguesstemp[xheapedOriginal%%min(roundvalues[roundvalues>0])!=0]=sample(c(1,length(roundvalues)+1),
                                                                           length(rguesstemp[xheapedOriginal%%min(roundvalues[roundvalues>0])!=0]),replace=T)
    rguesstemp[xheapedOriginal%%min(roundvalues[roundvalues>0])==0&rguesstemp %in% c(1,length(roundvalues)+1)] <-
      rguesstemp[xheapedOriginal%%min(roundvalues[roundvalues>0])==0&rguesstemp %in% c(1,length(roundvalues)+1)]+1
  }
  return(list(new,rguesstemp))
}
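# The recall-error branch in getNewV scales the measurement noise with the
# rounding behaviour: the standard deviation is recallParams[1] times the drawn
# rounding value plus recallParams[2] times the expected rounding value given
# the true value. A worked sketch with the defaults (illustrative numbers only):
if (FALSE) {
  nu <- 1/3; eta <- 1/3                   # default recallParams
  r_drawn <- 10                           # rounding value sampled for one observation
  r_expected <- 0.3*1 + 0.4*5 + 0.3*10    # E(rounding value | true value), illustrative
  sdV <- nu * r_drawn + eta * r_expected  # standard deviation of the recall error
  sdV
}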
#' Kernel density estimation for classified data
#' @param xclass classified values; matrix with two columns: lower and upper value
#' @param burnin burn-in sample size
#' @param samples sampling iteration size
#' @param boundary TRUE for positive only data (no positive density for negative values)
#' @param bw bandwidth selector method, defaults to "nrd0" see \code{density} for more options
#' @param evalpoints number of evaluation grid points
#' @param adjust as in \code{density}, the user can multiply the bandwidth by a certain factor such that bw=adjust*bw
#' @param dFunc optional character name of a density function (with "d", "p" and "q" functions available) for parametric estimation, such as "norm", "gamma" or "lnorm"
#' @return
#' The function returns a list object with the following objects (besides all input objects):
#' \item{\code{Mestimates}}{kde object containing the corrected density estimate}
#' \item{\code{gridx}}{Vector Grid on which density is evaluated}
#' \item{\code{resultDensity}}{Matrix with Estimated Density for each iteration}
#' \item{\code{resultX}}{Matrix of true latent values X estimates}
#' @examples
#' x=rlnorm(500, meanlog = 8, sdlog = 1)
#' classes <- c(0,500,1000,1500,2000,2500,3000,4000,5000,6000,8000,10000,15000,Inf)
#' xclass <- cut(x,breaks=classes)
#' xclass <- cbind(classes[as.numeric(xclass)], classes[as.numeric(xclass) + 1])
#' densityEst <- dclass(xclass=xclass, burnin=20, samples=50, evalpoints=1000)
#' plot(densityEst$Mestimates~densityEst$gridx ,lwd=2, type = "l")
#' @export
dclass <- function(xclass, burnin=2, samples=5, boundary=FALSE, bw="nrd0",
                   evalpoints=200, adjust=1, dFunc = NULL){
  if(max(xclass)==Inf){xclass[xclass == Inf]=3*max(xclass[xclass != Inf])}
  classmeans <- apply(xclass, 1, mean)
  #Create grid - sparr package requires identical grid length on both axis:
  gridx=seq(min(xclass),max(xclass),length=evalpoints)
  #Pilot Estimation
  if(is.null(dFunc)){
    if(boundary==FALSE){Mestimates <- density(classmeans,from=min(gridx),to=max(gridx),n=length(gridx),bw=2*max(xclass) / 10)$y}
    if(boundary==TRUE){Mestimates <- dbc(gridx=gridx,x=classmeans,bw=2*max(xclass) / 10)}
  } else {
    if(dFunc != "gb2"){
      pars <- fitdist(classmeans, dFunc, method = "mme")$estimate
    } else {
      pars <- mlfit.gb2(classmeans)[[2]]$par
    }
    args <- vector(mode = "list", length = 1 + length(pars))
    args[[1]] <- gridx
    args[2:(1+length(pars))] <- pars[1:length(pars)]
    Mestimates <- do.call(paste0("d", dFunc), args)
    Mestimates[is.na(Mestimates)] <- 0
    Mestimates[Mestimates == Inf] <- max(Mestimates[Mestimates != Inf])
  }
  #Result matrices
  resultDensity=matrix(ncol=c(burnin+samples),nrow=length(gridx))
  resultX=matrix(ncol=c(burnin+samples),nrow=length(xclass))
  if(!is.null(dFunc)){
    parsOut <- matrix(ncol=c(burnin+samples),nrow=length(pars))
  }
  selectionGrid<-lapply(1:nrow(xclass),function(k){
    selection=which(gridx>=xclass[k, 1]&gridx<xclass[k, 2])
    selection
  })
  # the remainder of dclass (its SEM sampling loop and return value) and the
  # roxygen header plus opening lines of dshapebivr were lost in extraction;
  # dshapebivr's signature below is reconstructed from man/dshapebivr.Rd
}

dshapebivr <- function(data, burnin = 2, samples = 5, adaptive = FALSE, shapefile,
                       gridsize = 200, boundary = FALSE, deleteShapes = NULL,
                       fastWeights = TRUE, numChains = 1, numThreads = 1){
  # the setup defining grid, gridx, gridy, npoints, Mestimates, resultDensity
  # and resultX was lost in extraction; the first selectionGrid block is
  # reconstructed as the mirror image (hole == FALSE) of selectionGridHole
  selectionGrid <- lapply(1:nrow(data),function(i){
    lapply(1:length(shapefile@polygons[[i]]@Polygons), function(j){
      if(shapefile@polygons[[i]]@Polygons[[j]]@hole == FALSE){
        which(point.in.polygon(grid[,1],grid[,2],
                               shapefile@polygons[[i]]@Polygons[[j]]@coords[,1],
                               shapefile@polygons[[i]]@Polygons[[j]]@coords[,2])==1)
      }
    }) %>% unlist
  })
  selectionGridHole <- lapply(1:nrow(data),function(i){
    lapply(1:length(shapefile@polygons[[i]]@Polygons), function(j){
      if(shapefile@polygons[[i]]@Polygons[[j]]@hole == TRUE){
        which(point.in.polygon(grid[,1],grid[,2],
                               shapefile@polygons[[i]]@Polygons[[j]]@coords[,1],
                               shapefile@polygons[[i]]@Polygons[[j]]@coords[,2])==1)
      }
    }) %>% unlist
  })
  selectionGrid <- lapply(1:length(selectionGrid), function(x) setdiff(selectionGrid[[x]],selectionGridHole[[x]]))
  outside <- grid[-unlist(selectionGrid), ]
  unselectionGrid = NULL
  unselectionGridHoles = NULL
  if(!is.null(deleteShapes)){
    unselectionGrid <- lapply(1:length(deleteShapes@polygons),function(i){
      lapply(1:length(deleteShapes@polygons[[i]]@Polygons), function(j){
        if(deleteShapes@polygons[[i]]@Polygons[[j]]@hole == FALSE){
          which(point.in.polygon(grid[,1],grid[,2],
                                 deleteShapes@polygons[[i]]@Polygons[[j]]@coords[,1],
                                 deleteShapes@polygons[[i]]@Polygons[[j]]@coords[,2])==1)
        }
      }) %>% unlist
    }) %>% unlist
    unselectionGridHoles <- lapply(1:length(deleteShapes@polygons),function(i){
      lapply(1:length(deleteShapes@polygons[[i]]@Polygons), function(j){
        if(deleteShapes@polygons[[i]]@Polygons[[j]]@hole == TRUE){
          which(point.in.polygon(grid[,1],grid[,2],
                                 deleteShapes@polygons[[i]]@Polygons[[j]]@coords[,1],
                                 deleteShapes@polygons[[i]]@Polygons[[j]]@coords[,2])==1)
        }
      }) %>% unlist
    }) %>% unlist
    unselectionGrid <- setdiff(unselectionGrid, unselectionGridHoles)
    outside <- grid[-setdiff(unlist(selectionGrid), unselectionGrid), ]
    selectionGrid <- lapply(selectionGrid, function(x) setdiff(x,unselectionGrid))
  }
  inside <- grid[unlist(selectionGrid), ]
  #if no match in grid for shape -> get nearest match (possible reason grid too small..)
  for(j in which(sapply(selectionGrid,length)==0)){
    selectionGrid[[j]] <- which.min(colSums((t(grid) - as.numeric(data[j,c(1,2)]))^2))
  }
  # decide whether to use threads or not
  numCoresToUse = min(numThreads, numChains, detectCores())
  if (numCoresToUse == 1) {
    for(c in seq(1,numChains)) {
      res <- dshapebivr_calcChain(c,NA,Mestimates,burnin,samples,grid,gridx,gridy,
                                  selectionGrid,shapefile,npoints,adaptive,
                                  boundary,fastWeights,data,inside,outside,
                                  deleteShapes,unselectionGrid)
      resultDensity[c,,,] <- res$resultDensity
      resultX[c,,,] <- res$resultX
      Mestimates <- res$Mestimates
    }
  } else {
    cl <- makeCluster(numCoresToUse, type="PSOCK", outfile="")
    # export necessary functions and libraries
    clusterEvalQ(cl, { library(ks); library(mvtnorm); library(dplyr); library(fastmatch) })
    # start parallel execution
    baseFilename = paste(tempdir(), "/", runif(1, min=1000000, max=9999999), "_", sep="")
    parLapply(cl, 1:numChains, dshapebivr_calcChain, baseFilename,
              Mestimates,burnin,samples,grid,gridx,gridy,
              selectionGrid,shapefile,npoints,adaptive,boundary,
              fastWeights,data,inside,outside,deleteShapes,unselectionGrid)
    stopCluster(cl)
    gc()
    # copy results to output data structures
    for(c in seq(1,numChains)) {
      ret <- NULL
      load(paste(baseFilename, c, sep = ""))
      resultDensity[c,,,] <- ret$resultDensity
      if (isOutputlevelHigh()) {
        resultX[c,,,] <- ret$resultX
      }
      Mestimates <- ret$Mestimates
      # resultDensity[c,,,] <- clusterResults[[c]]$resultDensity
      # resultX[c,,,] <- clusterResults[[c]]$resultX
      # Mestimates <- clusterResults[[c]]$Mestimates
      rm(ret); gc()
    }
  }
  indexGridColumns <- c( length( dim(resultDensity[,-c(1:burnin),,]) ) -1,
                         length( dim(resultDensity[,-c(1:burnin),,]) ))
  Mestimates$estimate=apply(resultDensity[,-c(1:burnin),,],indexGridColumns,mean)
  est<-list(Mestimates=Mestimates,resultDensity=resultDensity,
            data=data, gridx=gridx, gridy=gridy, shapefile=shapefile,
            burnin=burnin, samples=samples, adaptive=adaptive)
  if (isOutputlevelHigh()) {
    est[["resultX"]]=resultX
  }
  class(est) <- "bivshape"
  return(est)
}

dshapebivr_calcChain <- function(chain, saveAsBaseFileName, Mestimates, burnin, samples,
                                 grid, gridx, gridy, selectionGrid, shapefile, npoints,
                                 adaptive, boundary, fastWeights, data, inside, outside,
                                 deleteShapes, unselectionGrid) {
  printInfo("Start Chain ", chain)
  # return object, if saveAsBaseFileName is != NA it will be saved as RData File instead
  ret = list()
  ret$resultDensity=array(dim=c(burnin+samples,length(gridx),length(gridy)))
  ret$resultX=array(dim=c(samples+burnin,sum(npoints),2))
  #pre calculations for faster match usage (fmatch)
  Max_2SingleID = max(grid)
  inside_2SingleID = apply(inside, 1, function(x) { x[1]*Max_2SingleID + x[2] } )
  #SEM Estimator
  for(j in 1:(burnin+samples)){
    new=matrix(nrow=sum(npoints), ncol=2)
    newCnt=1
    printTiming("Calc Probabilities", {
      for(i in 1:length(selectionGrid)){
        probs=Mestimates$estimate[cbind(fmatch(grid[selectionGrid[[i]],1],
                                               Mestimates$eval.points[[1]]),
                                        fmatch(grid[selectionGrid[[i]],2],
                                               Mestimates$eval.points[[2]]))]+1E-10
        points=matrix(ncol=2,grid[selectionGrid[[i]],])
        if(length(selectionGrid[[i]])==0){
          points <- matrix(ncol=2,shapefile@polygons[[i]]@labpt)
          probs <- 1
        }
        if(npoints[i] > 0){
          if(npoints[i] == 1){
            sampleProp <- 1
          } else {
            sampleProp = sample(1:nrow(points),size=max(0,npoints[i],na.rm=T),replace=T,prob=probs)
          }
          new[newCnt:(newCnt+npoints[i]-1), ] = points[sampleProp,]
          newCnt = newCnt+npoints[i]
        }
      }
    })
    #recompute H
    printTiming("Recompute H", {
      if(adaptive==FALSE){
        H <- ks::Hpi(x = new, binned=TRUE)
      }
      if(adaptive==TRUE){
        H <- ks::Hpi(x = new, binned=TRUE)
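        # sparr's adaptive estimator expects a single scalar pilot bandwidth,
        # so the plug-in matrix is collapsed: sqrt(sqrt(H[1,1]*H[2,2])) is the
        # square root of the geometric mean of the two diagonal variances.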
        H <- sqrt(sqrt(H[1,1]*H[2,2]))
      }
    })
    w = rep(1, nrow(new))
    if(boundary == TRUE){
      printTiming("Calc Weights", {
        if(fastWeights == FALSE || j <= ceiling(0.1*(burnin+samples))){
          weights <- calcWeights_fast( inside = inside , outside = outside,
                                       gridx = gridx , gridy = gridy , H = H )
        }
      })
      printTiming("Match Weights", {
        new_2SingleID = apply(new, 1, function(x) { x[1]*Max_2SingleID + x[2] } )
        w <- weights[ fmatch(new_2SingleID, inside_2SingleID, nomatch = 1) ]
      })
    }
    #recompute density
    if(adaptive==FALSE){
      Mestimates <- ks::kde(x=new, H=H,gridsize=c(length(gridx),length(gridy)),
                            bgridsize=c(length(gridx),length(gridy)),
                            xmin=c(min(gridx),min(gridy)),xmax=c(max(gridx), max(gridy)),
                            binned=TRUE, w = w/mean(w))
    }
    if(adaptive==TRUE){
      counts <- plyr::count(new)
      MestimatesAd <- sparr::bivariate.density(data=counts[,c(1:2)],pilotH=H,
                                               res=length(gridx),xrange=range(gridx),
                                               yrange=range(gridy),adaptive=TRUE,
                                               comment=FALSE, counts=counts[,3])
      Mestimates$estimate=MestimatesAd$Zm
    }
    if(!is.null(deleteShapes)){
      Mestimates$estimate[-setdiff(unlist(selectionGrid), unselectionGrid)] = 0
    } else{
      Mestimates$estimate[-(unlist(selectionGrid))] = 0
    }
    Mestimates$estimate[is.na(Mestimates$estimate)]=1E-96
    ret$resultDensity[j,,] <- Mestimates$estimate
    ret$resultX[j,,] <- new
    printInfo("Iteration: ",j," of ", burnin+samples," in chain ", chain)
  }
  ret$Mestimates = Mestimates
  if (is.na(saveAsBaseFileName)) {
    return( ret )
  } else {
    save(ret, file = paste(saveAsBaseFileName, chain, sep=""), envir = environment())
    return( NA )
  }
}

calcWeights_fast <- function(inside, outside, gridx, gridy, H) {
  # idea: create a grid/image twice the size of the normal grid and calculate
  # the density once, with its mean in the middle of that grid.
  # Then only index calculations are needed to pick the elements that
  # have to be summed.
  # To get the 1D array position in the 2D image the calculation
  #   1D_index = y_index * imageSize_x + x_index
  # is used (lowest index is imageSize_x+1, because R starts with index 1).
  sizeX <- length(gridx)
  sizeY <- length(gridy)
  sizeX2 <- sizeX * 2
  sizeY2 <- sizeY * 2
  # get rastersize
  rX <- gridx[2] - gridx[1]
  rY <- gridy[2] - gridy[1]
  # create grid with doubled size and same rastersize as the original grid
  doubleGridInd <- as.matrix(expand.grid( seq(1,sizeX2), seq(1,sizeY2) ))
  doubleGrid <- cbind( doubleGridInd[,1] * rX, doubleGridInd[,2] *rY )
  # calculate the density only once on the double grid with mean at (sizeX * rX, sizeY * rY)
  pilot <- dmvnorm(x = doubleGrid, mean = c(sizeX * rX, sizeY * rY), sigma = H)
  pilot_sum <- sum(pilot)
  # store results of the density in an array which is accessible with a 1D index
  pilot_Ind1D <- rep(0, sizeX2*(sizeY2+1))
  pilot_Ind1D[ (doubleGridInd[,2]) * sizeX2 + doubleGridInd[,1] ] = pilot
  # get 2D indices for inside and outside grid points
  insideInd <- cbind( fmatch( inside [,1] , gridx ), fmatch( inside [,2] , gridy ) )
  outsideInd <- cbind( fmatch( outside[,1] , gridx ), fmatch( outside[,2] , gridy ) )
  # even faster but not readable
  term1 <- sizeX2 * ( insideInd[,2] + sizeY ) + sizeX + insideInd[,1]
  term2 <- insideInd[,1] + insideInd[,2] * sizeX2
  weights <- sapply(1:nrow(inside), function(x){
    ind <- term1 - term2[x]
    pilot_sum / sum( pilot_Ind1D[ ind ])
  })
  return(weights)
}
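# Worked illustration of the 1D-index trick above (not package code): with
# sizeX2 = 6 columns, cell (x_index = 2, y_index = 3) lives at 3 * 6 + 2 = 20
# in the flattened image. Because the kernel image is computed once on the
# doubled grid, shifting it onto any inside point only needs the integer
# offsets term1 - term2[x]; the weight is then the total kernel mass divided
# by the mass that falls inside the observation window.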
#' Bivariate Kernel density estimation for data classified in polygons or shapes
#' @param data data.frame with 4 columns: x-coordinate, y-coordinate (i.e. center of polygon),
#' number of observations in the area for the partial population, and number of observations
#' for the complete population.
#' @param burnin burn-in sample size
#' @param samples sampling iteration size
#' @param adaptive TRUE for adaptive kernel density estimation
#' @param shapefile shapefile with number of polygons equal to nrow(data)
#' @param gridsize number of evaluation grid points
#' @param boundary TRUE for a boundary-corrected kernel density estimate
#' @param fastWeights if TRUE, weights for boundary estimation are only computed for the first
#' 10 percent of samples to speed up computation
#' @param deleteShapes shapefile containing areas without observations
#' @param numChains number of chains of the SEM algorithm
#' @param numThreads number of threads to be used (only applicable if more than one chain is run)
#' @examples
#' \dontrun{
#' library(maptools)
#'
#' # Read Shapefile of Berlin Urban Planning Areas (download available from:
#' # https://www.statistik-berlin-brandenburg.de/opendata/RBS_OD_LOR_2015_12.zip)
#' Berlin <- rgdal::readOGR("X:/SomeDir/RBS_OD_LOR_2015_12.shp") #(from daten.berlin.de)
#'
#' # Get Dataset of Berlin Population (download available from:
#' # https://www.statistik-berlin-brandenburg.de/opendata/EWR201512E_Matrix.csv)
#' data <- read.csv2("X:/SomeDir/EWR201512E_Matrix.csv")
#'
#' # Form Dataset for Estimation Process
#' dataIn <- cbind(t(sapply(1:length(Berlin@polygons),
#'                          function(x) Berlin@polygons[[x]]@labpt)), data$E_E65U80, data$E_E)
#'
#' #Estimate Bivariate Proportions (may take some minutes)
#' PropEst <- dshapebivrProp(data = dataIn, burnin = 5, samples = 20, adaptive = FALSE,
#'                           shapefile = Berlin, gridsize=325, numChains = 16, numThreads = 4)}
#'
#' # Plot Proportions over Area:
#' \dontrun{
#' breaks <- seq(0,0.4,by=0.025)
#' image.plot(x=PropEst$Mestimates$eval.points[[1]],y=PropEst$Mestimates$eval.points[[2]],
#'            z=PropEst$proportion+1E-96, asp=1, breaks = breaks,
#'            col = colorRampPalette(brewer.pal(9,"YlOrRd"))(length(breaks)-1))
#' plot(Berlin, add=TRUE)}
#' @export
dshapebivrProp <- function(data, burnin=2, samples=5, adaptive=FALSE, shapefile,
                           gridsize=200, boundary = FALSE, deleteShapes = NULL,
                           fastWeights = TRUE, numChains=1, numThreads=1){
  ###########################################################
  #####    get polygon shape coordinates from data      #####
  ###########################################################
  pol.x <- list()
  pol.y <- list()
  for (i in 1:length(shapefile@polygons)) {
    pol.x[[i]] <- shapefile@polygons[[i]]@Polygons[[1]]@coords[,1]
    pol.y[[i]] <- shapefile@polygons[[i]]@Polygons[[1]]@coords[,2]
  }
  npointsAll <- data[ ,4] # observations per area: complete population
  npoints <- data[ ,3]    # observations per area: partial population
  if (length(gridsize) == 1) {
    gridsize = c(gridsize, gridsize)
  }
  #Create grid - sparr package requires identical grid length on both axes:
  gridx=seq(shapefile@bbox[1,1],shapefile@bbox[1,2],length=gridsize[1])
  gridy=seq(shapefile@bbox[2,1],shapefile@bbox[2,2],length=gridsize[2])
  grid <- as.matrix(expand.grid(gridx,gridy))
  #Pilot Estimation
  MestimatesAll <- ks::kde(x=data[,c(1,2)],
                           H = 10 * diag(c(diff(shapefile@bbox[1,])/sqrt(length(shapefile@polygons)),
                                           c(diff(shapefile@bbox[2,])/sqrt(length(shapefile@polygons)))))^2,
                           gridsize=c(length(gridx),length(gridy)),
                           xmin=c(min(gridx),min(gridy)),
                           xmax=c(max(gridx),max(gridy)),w=nrow(data)*data[,4]/sum(data[,4]))
  Mestimates <- ks::kde(x=data[,c(1,2)],
                        H = 10 * diag(c(diff(shapefile@bbox[1,])/sqrt(length(shapefile@polygons)),
                                        c(diff(shapefile@bbox[2,])/sqrt(length(shapefile@polygons)))))^2,
                        gridsize=c(length(gridx),length(gridy)),
                        xmin=c(min(gridx),min(gridy)),
                        xmax=c(max(gridx),max(gridy)),w=nrow(data)*data[,3]/sum(data[,3]))
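  # Pilot estimation note: both pilots are binned KDEs over the polygon
  # centroids with a common, strongly oversmoothing diagonal bandwidth
  # (bounding-box side length divided by sqrt(number of polygons), times 10,
  # squared). MestimatesAll is weighted by the complete-population counts and
  # Mestimates by the subgroup counts; their ratio later gives the proportion
  # surface.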
  #Result matrices
  resultDensity=array(dim=c(numChains, burnin+samples,length(gridx),length(gridy)))
  resultX=array(dim=c(numChains,samples+burnin,sum(npoints),2))
  proportionArray=array(dim=c(numChains,burnin+samples,length(gridx),length(gridy)))
  printTiming("Calc selectionGrid", {
    selectionGrid <- lapply(1:nrow(data),function(i){
      lapply(1:length(shapefile@polygons[[i]]@Polygons), function(j){
        if(shapefile@polygons[[i]]@Polygons[[j]]@hole == FALSE){
          which(point.in.polygon(grid[,1],grid[,2],shapefile@polygons[[i]]@Polygons[[j]]@coords[,1],
                                 shapefile@polygons[[i]]@Polygons[[j]]@coords[,2])==1)
        }
      }) %>% unlist
    })
  })
  printTiming("Calc selectionGridHole", {
    selectionGridHole <- lapply(1:nrow(data),function(i){
      lapply(1:length(shapefile@polygons[[i]]@Polygons), function(j){
        if(shapefile@polygons[[i]]@Polygons[[j]]@hole == TRUE){
          which(point.in.polygon(grid[,1],grid[,2],
                                 shapefile@polygons[[i]]@Polygons[[j]]@coords[,1],
                                 shapefile@polygons[[i]]@Polygons[[j]]@coords[,2])==1)
        }
      }) %>% unlist
    })
  })
  printTiming("Recalc selectionGrid", {
    selectionGrid <- lapply(1:length(selectionGrid),
                            function(x) setdiff(selectionGrid[[x]],selectionGridHole[[x]]))
    outside <- grid[-unlist(selectionGrid), ]
  })
  printTiming("Consider DeleteShapes", {
    unselectionGrid = NULL
    unselectionGridHoles = NULL
    if(!is.null(deleteShapes)){
      unselectionGrid <- lapply(1:length(deleteShapes@polygons),function(i){
        lapply(1:length(deleteShapes@polygons[[i]]@Polygons), function(j){
          if(deleteShapes@polygons[[i]]@Polygons[[j]]@hole == FALSE){
            which(point.in.polygon(grid[,1],grid[,2],
                                   deleteShapes@polygons[[i]]@Polygons[[j]]@coords[,1],
                                   deleteShapes@polygons[[i]]@Polygons[[j]]@coords[,2])==1)
          }
        } ) %>% unlist
      }) %>% unlist
      unselectionGridHoles <- lapply(1:length(deleteShapes@polygons),function(i){
        lapply(1:length(deleteShapes@polygons[[i]]@Polygons), function(j){
          if(deleteShapes@polygons[[i]]@Polygons[[j]]@hole == TRUE){
            which(point.in.polygon(grid[,1],grid[,2],
                                   deleteShapes@polygons[[i]]@Polygons[[j]]@coords[,1],
                                   deleteShapes@polygons[[i]]@Polygons[[j]]@coords[,2])==1)
          }
        } ) %>% unlist
      }) %>% unlist
      unselectionGrid <- setdiff(unselectionGrid, unselectionGridHoles)
      outside <- grid[-setdiff(unlist(selectionGrid), unselectionGrid), ]
      selectionGrid <- lapply(selectionGrid, function(x) setdiff(x,unselectionGrid))
    }
  })
  inside <- grid[unlist(selectionGrid), ]
  #SEM Estimator
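  # Outline of one SEM iteration (implemented in dshapebivrProp_calcChain):
  #  1. per area, draw latent coordinates for its observations from the grid
  #     points inside the polygon, proportional to the current densities;
  #  2. recompute the plug-in bandwidth H from the imputed coordinates;
  #  3. recompute the binned kernel density estimates (boundary-corrected via
  #     calcWeights_fast if requested);
  #  4. store density and proportion surfaces; after burn-in these are
  #     averaged over iterations and chains.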
  #if no match in grid for shape -> get nearest match (possible reason: grid too small)
  for(j in which(sapply(selectionGrid,length)==0)){
    selectionGrid[[j]] <- which.min(colSums((t(grid) - as.numeric(data[j,c(1,2)]))^2))
  }
  rm(selectionGridHole, pol.x, pol.y); gc()
  # decide whether to use threads or not
  numCoresToUse = min(numThreads, numChains, detectCores())
  if (numCoresToUse == 1) {
    for(c in seq(1,numChains)) {
      res <- dshapebivrProp_calcChain(c,NA,MestimatesAll,Mestimates,burnin,
                                      samples,grid,gridx,gridy,selectionGrid,
                                      shapefile,npointsAll, npoints,adaptive,boundary,fastWeights,
                                      data,inside,outside,deleteShapes,
                                      unselectionGrid)
      resultDensity[c,,,] <- res$resultDensity
      if (isOutputlevelHigh()) {
        resultX[c,,,] <- res$resultX
      }
      proportionArray[c,,,] <- res$proportionArray
      Mestimates <- res$Mestimates
      rm(res); gc()
    }
  } else {
    cl <- makeCluster(numCoresToUse , type="PSOCK", outfile="")
    # export necessary functions and libraries
    clusterEvalQ(cl, { library(ks); library(mvtnorm); library(dplyr); library(fastmatch);
      library(Kernelheaping) })
    # generate random filename which is used for result files for each thread
    baseFilename = paste(tempdir(), "/", runif(1, min=1000000, max=9999999), "_", sep="")
    # start parallel execution
    parLapply(cl, 1:numChains, dshapebivrProp_calcChain, baseFilename,
              MestimatesAll, Mestimates, burnin, samples, grid, gridx, gridy,
              selectionGrid, shapefile, npointsAll, npoints, adaptive,boundary,fastWeights,
              data, inside, outside, deleteShapes, unselectionGrid)
    stopCluster(cl)
    rm(cl)
    gc()
    # copy results to output data structures
    for(c in seq(1,numChains)) {
      ret <- NULL
      load(paste(baseFilename, c, sep = ""))
      resultDensity[c,,,] <- ret$resultDensity
      if (isOutputlevelHigh()) {
        resultX[c,,,] <- ret$resultX
      }
      proportionArray[c,,,] <- ret$proportionArray
      Mestimates <- ret$Mestimates
      # resultDensity[c,,,] <- clusterResults[[c]]$resultDensity
      # resultX[c,,,] <- clusterResults[[c]]$resultX
      # proportionArray[c,,,] <- clusterResults[[c]]$proportionArray
      # Mestimates <- clusterResults[[c]]$Mestimates
      rm(ret); gc()
    }
  }
  indexGridColumns <- c( length( dim(resultDensity[,-c(1:burnin),,]) ) -1,
                         length( dim(resultDensity[,-c(1:burnin),,]) ))
  Mestimates$estimate=apply(resultDensity[,-c(1:burnin),,],indexGridColumns,mean)
  est<-list(Mestimates=Mestimates,resultDensity=resultDensity,
            data=data, gridx=gridx, gridy=gridy, shapefile=shapefile,
            burnin=burnin, samples=samples, adaptive=adaptive, numChains=numChains,
            proportions = apply(proportionArray[,-c(1:burnin),,],indexGridColumns,mean))
  if (isOutputlevelHigh()) {
    est[["resultX"]]=resultX
  }
  class(est) <- "bivshape"
  return(est)
}

dshapebivrProp_calcChain <- function(chain, saveAsBaseFileName, MestimatesAll, Mestimates,
                                     burnin, samples, grid, gridx, gridy, selectionGrid,
                                     shapefile, npointsAll, npoints, adaptive, boundary,
                                     fastWeights, data, inside, outside, deleteShapes,
                                     unselectionGrid) {
  printInfo("Start Chain ", chain)
  # return object; if saveAsBaseFileName is != NA it will be saved as RData File instead
  ret = list()
  ret$resultDensity=array(dim=c(burnin+samples,length(gridx),length(gridy)))
  ret$resultX=array(dim=c(samples+burnin,sum(npoints),2))
  ret$proportionArray=array(dim=c(burnin+samples,length(gridx),length(gridy)))
  #pre calculations for faster match usage (fmatch)
  Max_2SingleID = max(grid)
  inside_2SingleID = apply(inside, 1, function(x) { x[1]*Max_2SingleID + x[2] } )
  for(j in 1:(burnin+samples)){
    newAll=matrix(nrow=sum(npointsAll), ncol=2)
    newAllCnt=1
    new=matrix(nrow=sum(npoints), ncol=2)
    newCnt=1
    print(j)
    printTiming("Calc Probabilities", {
      for(i in 1:length(selectionGrid)){
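        # membership probabilities for area i (computed below): with the
        # normalised complete-population density probsAll and subgroup density
        # probs on the area's grid points, each grid point x receives
        #   probsPROP(x) = probs(x)/probsAll(x) * n_subgroup_i / n_total_i,
        # an estimate of P(subgroup | location x) inside the area.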
        probsAll=MestimatesAll$estimate[cbind(match(grid[selectionGrid[[i]],1],
                                                    MestimatesAll$eval.points[[1]]),
                                              match(grid[selectionGrid[[i]],2],
                                                    MestimatesAll$eval.points[[2]]))]+1E-16
        probs=Mestimates$estimate[cbind(match(grid[selectionGrid[[i]],1],
                                              Mestimates$eval.points[[1]]),
                                        match(grid[selectionGrid[[i]],2],
                                              Mestimates$eval.points[[2]]))]+1E-16
        probsAll = probsAll/sum(probsAll)
        probs = probs/sum(probs)
        probsPROP=(probs/probsAll)*(sum(data[i,3])/sum(data[i,4]))
        points=matrix(ncol=2,grid[selectionGrid[[i]],])
        if(length(selectionGrid[[i]])==0){
          points <- matrix(ncol=2,shapefile@polygons[[i]]@labpt)
          probs=1
          probsAll=1
        }
        if(npointsAll[i]>0){
          sampleAll=sample(1:nrow(points),size=max(0,npointsAll[i],na.rm=T),replace=T,prob=probsAll)
          newAll[newAllCnt:(newAllCnt+npointsAll[i]-1), ] = points[sampleAll,]
          newAllCnt = newAllCnt+npointsAll[i]
        }
        if(npoints[i]>0){
          if (length(sampleAll) == 1) {
            sampleProp = sampleAll
          } else {
            sampleProp <- sample(sampleAll, size = max(0, npoints[i], na.rm = T),
                                 prob = probsPROP[sampleAll])
          }
          new[newCnt:(newCnt+npoints[i]-1), ] = points[sampleProp,]
          newCnt = newCnt+npoints[i]
        }
      }
    })
    #recompute H
    printTiming("Recompute H", {
      if(adaptive==FALSE){
        H <- ks::Hpi(x = new, binned=TRUE) * 2
      }
      if(adaptive==TRUE){
        H <- ks::Hpi(x = new, binned=TRUE)
        H <- sqrt(sqrt(H[1,1]*H[2,2]))
      }
    })
    wAll = rep(1, nrow(newAll))
    w = rep(1, nrow(new))
    if(boundary == TRUE){
      printTiming("Calc Weights", {
        if(fastWeights == FALSE || j <= ceiling(0.1*(burnin+samples))){
          weights <- calcWeights_fast( inside = inside , outside = outside,
                                       gridx = gridx , gridy = gridy , H = H )
        }
      })
      printTiming("Match Weights", {
        new_2SingleID = apply(new, 1, function(x) { x[1]*Max_2SingleID + x[2] } )
        newAll_2SingleID = apply(newAll, 1, function(x) { x[1]*Max_2SingleID + x[2] } )
        w <- weights[ fmatch(new_2SingleID, inside_2SingleID, nomatch = 1) ]
        wAll <- weights[ fmatch(newAll_2SingleID, inside_2SingleID, nomatch = 1) ]
      })
    }
    #recompute density
    printTiming("Recompute Density", {
      if(adaptive==FALSE){
        MestimatesAll <- ks::kde(x=newAll, H=H,gridsize=c(length(gridx),length(gridy)),
                                 bgridsize=c(length(gridx),length(gridy)),
                                 xmin=c(min(gridx),min(gridy)),xmax=c(max(gridx),max(gridy)),
                                 binned=TRUE, w = wAll/mean(wAll))
        Mestimates <- ks::kde(x=new, H=H,gridsize=c(length(gridx),length(gridy)),
                              bgridsize=c(length(gridx),length(gridy)),
                              xmin=c(min(gridx),min(gridy)),xmax=c(max(gridx),max(gridy)),
                              binned=TRUE, w = w/mean(w))
      }
      if(adaptive==TRUE){
        counts <- plyr::count(new)
        MestimatesAd <- sparr::bivariate.density(data=counts[,c(1:2)],pilotH=H,
                                                 res=length(gridx),xrange=range(gridx),
                                                 yrange=range(gridy),adaptive=TRUE,
                                                 comment=FALSE, counts=counts[,3])
        Mestimates$estimate=MestimatesAd$Zm
      }
      #weird behaviour of ks::kde returning negative densities (probably due to numeric problems)
      Mestimates$estimate[Mestimates$estimate < 0] <- 0
      MestimatesAll$estimate[MestimatesAll$estimate < 0] <- 0
    })
    printTiming("Delete Shapes", {
      if(!is.null(deleteShapes)){
        Mestimates$estimate[-setdiff(unlist(selectionGrid), unselectionGrid)] = 0
        MestimatesAll$estimate[-setdiff(unlist(selectionGrid), unselectionGrid)] = 0
      } else{
        Mestimates$estimate[-(unlist(selectionGrid))] = 0
        MestimatesAll$estimate[-(unlist(selectionGrid))] = 0
      }
    })
    printTiming("Match Density", {
      densityAll <- matrix(NA,ncol=ncol(Mestimates$estimate), nrow=nrow(Mestimates$estimate))
      densityPart <- matrix(NA,ncol=ncol(Mestimates$estimate), nrow=nrow(Mestimates$estimate))
      densityAll[cbind(match(grid[selectionGrid %>% unlist ,1],MestimatesAll$eval.points[[1]]),
                       match(grid[selectionGrid %>% unlist,2],MestimatesAll$eval.points[[2]]))] =
        MestimatesAll$estimate[cbind(match(grid[selectionGrid %>% unlist ,1],MestimatesAll$eval.points[[1]]),
                                     match(grid[selectionGrid %>% unlist,2],MestimatesAll$eval.points[[2]]))]
      densityPart[cbind(match(grid[selectionGrid %>% unlist ,1],MestimatesAll$eval.points[[1]]),
                        match(grid[selectionGrid %>% unlist,2],MestimatesAll$eval.points[[2]]))] =
        Mestimates$estimate[cbind(match(grid[selectionGrid %>% unlist ,1],MestimatesAll$eval.points[[1]]),
                                  match(grid[selectionGrid %>% unlist,2],MestimatesAll$eval.points[[2]]))]
    })
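    # proportion surface (computed next): both matched densities are
    # renormalised over the grid and their pointwise ratio is rescaled by the
    # overall subgroup share,
    #   proportion = (densityPart/densityAll) * sum(data[,3])/sum(data[,4]),
    # and finally clamped to [0, 1].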
    printTiming("Assignments", {
      densityPart <- densityPart/sum(densityPart, na.rm=TRUE)
      densityAll <- densityAll/sum(densityAll, na.rm=TRUE)
      densityAll <- densityAll+1E-96
      proportion <- (densityPart/densityAll) * (sum(data[,3])/sum(data[,4]))
      proportion[proportion<0]=0
      proportion[proportion>1]=1
      Mestimates$estimate[is.na(Mestimates$estimate)]=1E-96
      ret$resultDensity[j,,] <- Mestimates$estimate
      ret$proportionArray[j,,] <- proportion
      ret$resultX[j,,] <- new
    })
    printInfo("Iteration: ",j," of ", burnin+samples," in chain ", chain)
  }
  ret$Mestimates = Mestimates
  if (is.na(saveAsBaseFileName)) {
    return( ret )
  } else {
    save(ret, file = paste(saveAsBaseFileName, chain, sep=""), envir = environment())
    return( NA )
  }
}

#' 3d Kernel density estimation for data classified in polygons or shapes
#' @param data data.frame with 5 columns: x-coordinate, y-coordinate (i.e. center of polygon),
#' number of observations in the area for the partial population, number of observations for
#' the complete population, and a third (numeric) variable.
#' @param burnin burn-in sample size
#' @param samples sampling iteration size
#' @param shapefile shapefile with number of polygons equal to nrow(data) / length(unique(data[,5]))
#' @param gridsize number of evaluation grid points
#' @param boundary TRUE for a boundary-corrected kernel density estimate
#' @param fastWeights if TRUE, weights for boundary estimation are only computed for the first
#' 10 percent of samples to speed up computation
#' @param deleteShapes shapefile containing areas without observations
#' @param numChains number of chains of the SEM algorithm
#' @param numThreads number of threads to be used (only applicable if more than one chain is run)
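#' @examples
#' # Hypothetical sketch (not run): 'dataIn3d' is assumed to be built like
#' # 'dataIn' in the dshapebivrProp() example, stacked over several years and
#' # extended by a numeric fifth column holding the year.
#' \dontrun{
#' Prop3dEst <- dshape3dProp(data = dataIn3d, burnin = 5, samples = 10,
#'                           shapefile = Berlin, gridsize = 325,
#'                           numChains = 4, numThreads = 4)}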
#' @export
dshape3dProp <- function(data, burnin=2, samples=5, shapefile, gridsize=200,
                         boundary = FALSE, deleteShapes = NULL, fastWeights = TRUE,
                         numChains=1, numThreads=1){
  ###########################################################
  #####    get polygon shape coordinates from data      #####
  ###########################################################
  # adaptive not implemented
  adaptive <- FALSE
  # order data
  data <- data[order(data[,5]),]
  pol.x <- list()
  pol.y <- list()
  for (i in 1:length(shapefile@polygons)) {
    pol.x[[i]] <- shapefile@polygons[[i]]@Polygons[[1]]@coords[,1]
    pol.y[[i]] <- shapefile@polygons[[i]]@Polygons[[1]]@coords[,2]
  }
  npointsAll <- data[ ,c(4,5)] # number of observations for the complete population and third variable
  npoints <- data[ ,c(3,5)]    # number of observations in the area for the partial population and third variable
  if (length(gridsize) == 1) {
    gridsize = c(gridsize, gridsize)
  }
  #Create grid - sparr package requires identical grid length on both axes:
  gridx=seq(shapefile@bbox[1,1],shapefile@bbox[1,2],length=gridsize[1])
  gridy=seq(shapefile@bbox[2,1],shapefile@bbox[2,2],length=gridsize[2])
  grid <- as.matrix(expand.grid(gridx,gridy))
  # calculate weights for the complete population and each unique value of the third variable
  dataAllWeight <- sapply(unique(data[,5]), function(x){
    d <- data[data[,5]==x,]
    nrow(d)*d[,4]/sum(d[,4])
  })
  # calculate weights for the partial population and each unique value of the third variable
  dataWeight <- sapply(unique(data[,5]), function(x){
    d <- data[data[,5]==x,]
    nrow(d)*d[,3]/sum(d[,3])
  })
  #Pilot Estimation
  # 3 dimensional Kernel Density Estimation
  # added new weight variable and adjusted x, H, gridsize, xmin and xmax for the third dimension
  MestimatesAll <- ks::kde(x=data[,c(1,2,5)],
                           H = 10 * diag(c(diff(shapefile@bbox[1,])/sqrt(length(shapefile@polygons)),
                                           c(diff(shapefile@bbox[2,])/sqrt(length(shapefile@polygons))),
                                           c(diff(c(min(data[,5]),max(data[,5]))))))^2,
                           gridsize=c(length(gridx),length(gridy),length(unique(data[,5]))),
                           xmin=c(min(gridx),min(gridy),min(data[,5])),
                           xmax=c(max(gridx),max(gridy),max(data[,5])),
                           w=unlist(as.vector(dataAllWeight)))
  Mestimates <- ks::kde(x=data[,c(1,2,5)],
                        H = 10 * diag(c(diff(shapefile@bbox[1,])/sqrt(length(shapefile@polygons)),
                                        c(diff(shapefile@bbox[2,])/sqrt(length(shapefile@polygons))),
                                        c(diff(c(min(data[,5]),max(data[,5]))))))^2,
                        gridsize=c(length(gridx),length(gridy),length(unique(data[,5]))),
                        xmin=c(min(gridx),min(gridy),min(data[,5])),
                        xmax=c(max(gridx),max(gridy),max(data[,5])),
                        w=unlist(as.vector(dataWeight)))
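  # Pilot estimation note: the third axis of both pilot estimates holds the
  # unique values of data[,5] (e.g. time periods); the weights are computed
  # per period so that each period's weights sum to its number of areas,
  # analogous to the bivariate pilot in dshapebivrProp().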
  #Result matrices
  resultDensity=array(dim=c(numChains, burnin+samples,length(gridx),length(gridy),
                            length(unique(npointsAll[,2]))))
  resultX=array(dim=c(numChains,samples+burnin,sum(npoints[,1]),3))
  proportionArray=array(dim=c(numChains, burnin+samples,length(gridx),length(gridy),
                              length(unique(npointsAll[,2]))))
  printTiming("Calc selectionGrid", {
    selectionGrid <- lapply(1:nrow(unique(data[,1:2])),function(i){
      lapply(1:length(shapefile@polygons[[i]]@Polygons), function(j){
        if(shapefile@polygons[[i]]@Polygons[[j]]@hole == FALSE){
          which(point.in.polygon(grid[,1],grid[,2],shapefile@polygons[[i]]@Polygons[[j]]@coords[,1],
                                 shapefile@polygons[[i]]@Polygons[[j]]@coords[,2])==1)
        }
      }) %>% unlist
    })
  })
  printTiming("Calc selectionGridHole", {
    selectionGridHole <- lapply(1:nrow(unique(data[,1:2])),function(i){
      lapply(1:length(shapefile@polygons[[i]]@Polygons), function(j){
        if(shapefile@polygons[[i]]@Polygons[[j]]@hole == TRUE){
          which(point.in.polygon(grid[,1],grid[,2],
                                 shapefile@polygons[[i]]@Polygons[[j]]@coords[,1],
                                 shapefile@polygons[[i]]@Polygons[[j]]@coords[,2])==1)
        }
      }) %>% unlist
    })
  })
  printTiming("Recalc selectionGrid", {
    selectionGrid <- lapply(1:length(selectionGrid),
                            function(x) setdiff(selectionGrid[[x]],selectionGridHole[[x]]))
    outside <- grid[-unlist(selectionGrid), ]
  })
  printTiming("Consider DeleteShapes", {
    unselectionGrid = NULL
    unselectionGridHoles = NULL
    if(!is.null(deleteShapes)){
      unselectionGrid <- lapply(1:length(deleteShapes@polygons),function(i){
        lapply(1:length(deleteShapes@polygons[[i]]@Polygons), function(j){
          if(deleteShapes@polygons[[i]]@Polygons[[j]]@hole == FALSE){
            which(point.in.polygon(grid[,1],grid[,2],
                                   deleteShapes@polygons[[i]]@Polygons[[j]]@coords[,1],
                                   deleteShapes@polygons[[i]]@Polygons[[j]]@coords[,2])==1)
          }
        } ) %>% unlist
      }) %>% unlist
      unselectionGridHoles <- lapply(1:length(deleteShapes@polygons),function(i){
        lapply(1:length(deleteShapes@polygons[[i]]@Polygons), function(j){
          if(deleteShapes@polygons[[i]]@Polygons[[j]]@hole == TRUE){
            which(point.in.polygon(grid[,1],grid[,2],
                                   deleteShapes@polygons[[i]]@Polygons[[j]]@coords[,1],
                                   deleteShapes@polygons[[i]]@Polygons[[j]]@coords[,2])==1)
          }
        } ) %>% unlist
      }) %>% unlist
      unselectionGrid <- setdiff(unselectionGrid, unselectionGridHoles)
      outside <- grid[-setdiff(unlist(selectionGrid), unselectionGrid), ]
      selectionGrid <- lapply(selectionGrid, function(x) setdiff(x,unselectionGrid))
    }
  })
  inside <- grid[unlist(selectionGrid), ]
  #SEM Estimator
  #if no match in grid for shape -> get nearest match (possible reason: grid too small)
  for(j in which(sapply(selectionGrid,length)==0)){
    selectionGrid[[j]] <- which.min(colSums((t(grid) - as.numeric(data[j,c(1,2)]))^2))
  }
  rm(selectionGridHole, pol.x, pol.y); gc()
  # decide whether to use threads or not
  numCoresToUse = min(numThreads, numChains, detectCores())
  if (numCoresToUse == 1) {
    for(c in seq(1,numChains)) {
      res <- dshape3dProp_calcChain(c,NA,MestimatesAll,Mestimates,burnin,
                                    samples,grid,gridx,gridy,selectionGrid,
                                    shapefile,npointsAll, npoints,adaptive,boundary,fastWeights,
                                    data,inside,outside,deleteShapes,
                                    unselectionGrid)
      resultDensity[c,,,,] <- res$resultDensity
      if (isOutputlevelHigh()) {
        resultX[c,,,] <- res$resultX
      }
      proportionArray[c,,,,] <- res$proportionArray
      Mestimates <- res$Mestimates
      rm(res); gc()
    }
  } else {
    cl <- makeCluster(numCoresToUse , type="PSOCK", outfile="")
    # export necessary functions and libraries
    clusterEvalQ(cl, { library(ks); library(mvtnorm); library(dplyr); library(fastmatch);
      library(Kernelheaping) })
    # generate random filename which is used for result files for each thread
    baseFilename = paste(tempdir(), "/", runif(1, min=1000000, max=9999999), "_", sep="")
    # start parallel execution
    parLapply(cl, 1:numChains, dshape3dProp_calcChain, baseFilename,
              MestimatesAll, Mestimates, burnin, samples, grid, gridx, gridy,
              selectionGrid, shapefile, npointsAll, npoints, adaptive,boundary,fastWeights,
              data, inside, outside, deleteShapes, unselectionGrid)
    stopCluster(cl)
    rm(cl)
    gc()
    # copy results to output data structures
    for(c in seq(1,numChains)) {
      ret <- NULL
      load(paste(baseFilename, c, sep = ""))
      resultDensity[c,,,,] <- ret$resultDensity
      if (isOutputlevelHigh()) {
        resultX[c,,,] <- ret$resultX
      }
      proportionArray[c,,,,] <- ret$proportionArray
      Mestimates <- ret$Mestimates
      # resultDensity[c,,,] <- clusterResults[[c]]$resultDensity
      # resultX[c,,,] <- clusterResults[[c]]$resultX
      # proportionArray[c,,,] <- clusterResults[[c]]$proportionArray
      # Mestimates <- clusterResults[[c]]$Mestimates
      rm(ret); gc()
    }
  }
  indexGridColumns <- c( length( dim(resultDensity[,-c(1:burnin),,,]) ) -2,
                         length( dim(resultDensity[,-c(1:burnin),,,]) ) -1,
                         length( dim(resultDensity[,-c(1:burnin),,,]) ))
  Mestimates$estimate=apply(resultDensity[,-c(1:burnin),,,],indexGridColumns,mean)
  est<-list(Mestimates=Mestimates,resultDensity=resultDensity,
            data=data, gridx=gridx, gridy=gridy, shapefile=shapefile,
            burnin=burnin, samples=samples, adaptive=adaptive, numChains=numChains,
            proportions = apply(proportionArray[,-c(1:burnin),,,],indexGridColumns,mean))
  if (isOutputlevelHigh()) {
    est[["resultX"]]=resultX
  }
  class(est) <- "3dshape"
  return(est)
}

dshape3dProp_calcChain <- function(chain, saveAsBaseFileName, MestimatesAll, Mestimates,
                                   burnin, samples, grid, gridx, gridy, selectionGrid,
                                   shapefile, npointsAll, npoints, adaptive, boundary,
                                   fastWeights, data, inside, outside, deleteShapes,
                                   unselectionGrid) {
  printInfo("Start Chain ", chain)
  # return object; if saveAsBaseFileName is != NA it will be saved as RData File instead
  ret = list()
  ret$resultDensity=array(dim=c(burnin+samples,length(gridx),length(gridy),
                                length(unique(npointsAll[,2]))))
  ret$resultX=array(dim=c(samples+burnin,sum(npoints[,1]),3))
  ret$proportionArray=array(dim=c(burnin+samples,length(gridx),length(gridy),
                                  length(unique(npointsAll[,2]))))
  #pre calculations for faster match usage (fmatch)
  Max_2SingleID = max(grid)
  inside_2SingleID = apply(inside, 1, function(x) { x[1]*Max_2SingleID + x[2] } )
  for(j in 1:(burnin+samples)){
    printInfo("Start Iteration: ",j," of ", burnin+samples," in chain ", chain)
    newAll=NULL
    new=NULL
    # added a new loop:
    # in dshapebivrProp() sampling is done separately for each area (i) from the 2d density
    # in dshape3dProp() sampling is done separately for each time period (k) and each area (i)
    # from the 3d density
    for(k in 1:length(unique(npointsAll[,2]))){
      print(paste0("... ",k," / ",length(unique(npointsAll[,2]))))
      # subset data and npoints, keeping only observation k of the third variable
      # do similar steps as in the 2d case and later bind together the sampled
      # geocoordinates for each time period
      datak <- data[data[,5]==unique(data[,5])[k],]
      npointsAllk <- npointsAll[npointsAll[,2]==unique(npointsAll[,2])[k],]
      npointsk <- npoints[npoints[,2]==unique(npoints[,2])[k],]
      newAllk=matrix(nrow=sum(npointsAllk[,1]), ncol=3)
      newAllCnt=1
      newk=matrix(nrow=sum(npointsk[,1]), ncol=3)
      newCnt=1
      printTiming("Calc Probabilities", {
        for(i in 1:length(selectionGrid)){
          # probs for observation k of the third variable from the 3d density
          probsAll=MestimatesAll$estimate[cbind(match(grid[selectionGrid[[i]],1],
                                                      MestimatesAll$eval.points[[1]]),
                                                match(grid[selectionGrid[[i]],2],
                                                      MestimatesAll$eval.points[[2]]),
                                                match(MestimatesAll$eval.points[[3]][k],
                                                      MestimatesAll$eval.points[[3]]))]+1E-16
          probs=Mestimates$estimate[cbind(match(grid[selectionGrid[[i]],1],
                                                Mestimates$eval.points[[1]]),
                                          match(grid[selectionGrid[[i]],2],
                                                Mestimates$eval.points[[2]]),
                                          match(Mestimates$eval.points[[3]][k],
                                                Mestimates$eval.points[[3]]))]+1E-16
          probsAll = probsAll/sum(probsAll)
          probs = probs/sum(probs)
          probsPROP=(probs/probsAll)*(sum(datak[i,3])/sum(datak[i,4]))
          points=matrix(ncol=2,grid[selectionGrid[[i]],])
          points=cbind(points,as.integer(unique(data[,5])[k]))
          if(length(selectionGrid[[i]])==0){
            points <- matrix(ncol=2,shapefile@polygons[[i]]@labpt)
            probs=1
            probsAll=1
          }
          if(npointsAllk[,1][i]>0){
            sampleAll=sample(1:nrow(points),size=max(0,npointsAllk[,1][i],na.rm=T),
                             replace=T,prob=probsAll)
            newAllk[newAllCnt:(newAllCnt+npointsAllk[,1][i]-1), ] = points[sampleAll,]
            newAllCnt = newAllCnt+npointsAllk[,1][i]
          }
          if(npointsk[,1][i]>0){
            if (length(sampleAll) == 1) {
              sampleProp = sampleAll
            } else {
              sampleProp <- sample(sampleAll, size = max(0, npointsk[,1][i], na.rm = T),
                                   prob = probsPROP[sampleAll])
            }
            newk[newCnt:(newCnt+npointsk[,1][i]-1), ] = points[sampleProp,]
            newCnt = newCnt+npointsk[,1][i]
          }
        }
      }) # end of i
      newAllk <- data.frame(newAllk) %>% dplyr::group_by_all() %>% dplyr::count() %>%
        dplyr::distinct() %>% as.matrix() # slightly faster but requires dplyr
      # newAllk <- as.matrix(plyr::ddply(data.frame(newAllk),.(X1,X2,X3),nrow)) # uses plyr
      newAll <- rbind(newAll,newAllk)
      new <- rbind(new,newk)
    }
    #recompute H
    printTiming("Recompute H", {
      if(adaptive==FALSE){
        H <- ks::Hpi(x = new, binned=TRUE) * 2
      }
      if(adaptive==TRUE){
        H <- ks::Hpi(x = new, binned=TRUE)
        # adjusted for the 3rd dimension
        H <- sqrt(sqrt(H[1,1]*H[2,2]*H[3,3]))
      }
    })
    wAll = newAll[,4]
    w = rep(1, nrow(new))
    if(boundary == TRUE){
      printTiming("Calc Weights", {
        if(fastWeights == FALSE || j <= ceiling(0.1*(burnin+samples))){
          weights <- calcWeights_fast( inside = inside , outside = outside,
                                       gridx = gridx , gridy = gridy , H = H[1:2,1:2])
        }
      })
      printTiming("Match Weights", {
        new_2SingleID = apply(new, 1, function(x) { x[1]*Max_2SingleID + x[2] } )
        newAll_2SingleID = apply(newAll, 1, function(x) { x[1]*Max_2SingleID + x[2] } )
        w <- weights[ fmatch(new_2SingleID, inside_2SingleID, nomatch = 1) ]
        wAll <- weights[ fmatch(newAll_2SingleID, inside_2SingleID, nomatch = 1) ] * wAll
      })
    }
    #recompute density
    printTiming("Recompute Density", {
      if(adaptive==FALSE){
        # added new weight variable and adjusted x, H, gridsize, xmin and xmax for the third dimension
        MestimatesAll <- ks::kde(x=newAll[,1:3], H = H,
                                 gridsize=c(length(gridx),length(gridy),length(unique(data[,5]))),
                                 bgridsize=c(length(gridx),length(gridy),length(unique(data[,5]))),
                                 xmin=c(min(gridx),min(gridy),min(unique(data[,5]))),
                                 xmax=c(max(gridx),max(gridy),max(unique(data[,5]))),
                                 binned=TRUE, w = wAll/mean(wAll))
        Mestimates <- ks::kde(x=new, H=H,
                              gridsize=c(length(gridx),length(gridy),length(unique(data[,5]))),
                              bgridsize=c(length(gridx),length(gridy),length(unique(data[,5]))),
                              xmin=c(min(gridx),min(gridy),min(unique(data[,5]))),
                              xmax=c(max(gridx),max(gridy),max(unique(data[,5]))),
                              binned=TRUE, w = w/mean(w))
      }
      if(adaptive==TRUE){
        counts <- plyr::count(new)
        MestimatesAd <- sparr::bivariate.density(data=counts[,c(1:2)],pilotH=H,
                                                 res=length(gridx),xrange=range(gridx),
                                                 yrange=range(gridy),adaptive=TRUE,
                                                 comment=FALSE, counts=counts[,4])
        Mestimates$estimate=MestimatesAd$Zm
      }
      #weird behaviour of ks::kde returning negative densities (probably due to numeric problems)
      Mestimates$estimate[Mestimates$estimate < 0] <- 0
      MestimatesAll$estimate[MestimatesAll$estimate < 0] <- 0
    })
    printTiming("Delete Shapes", {
      if(!is.null(deleteShapes)){
        for(k in 1:length(MestimatesAll$eval.points[[3]])){
          Mestimates$estimate[,,k][-setdiff(unlist(selectionGrid), unselectionGrid)] = 0
          MestimatesAll$estimate[,,k][-setdiff(unlist(selectionGrid), unselectionGrid)] = 0
        }
      } else{
        for(k in 1:length(MestimatesAll$eval.points[[3]])){
          Mestimates$estimate[,,k][-(unlist(selectionGrid))] = 0
          MestimatesAll$estimate[,,k][-(unlist(selectionGrid))] = 0
        }
      }
    })
    printTiming("Match Density", {
      # adjusted for the 3rd dimension
      densityAll <- array(NA,dim(MestimatesAll$estimate))
      densityPart <- array(NA,dim(Mestimates$estimate))
      for(k in 1:length(MestimatesAll$eval.points[[3]])){
        densityAll[cbind(match(grid[selectionGrid %>% unlist ,1],MestimatesAll$eval.points[[1]]),
                         match(grid[selectionGrid %>% unlist,2],MestimatesAll$eval.points[[2]]),
                         match(MestimatesAll$eval.points[[3]][k],MestimatesAll$eval.points[[3]]))] =
          MestimatesAll$estimate[cbind(match(grid[selectionGrid %>% unlist ,1],MestimatesAll$eval.points[[1]]),
                                       match(grid[selectionGrid %>% unlist,2],MestimatesAll$eval.points[[2]]),
                                       match(MestimatesAll$eval.points[[3]][k],MestimatesAll$eval.points[[3]]))]
        densityPart[cbind(match(grid[selectionGrid %>% unlist ,1],MestimatesAll$eval.points[[1]]),
                          match(grid[selectionGrid %>% unlist,2],MestimatesAll$eval.points[[2]]),
                          match(MestimatesAll$eval.points[[3]][k],MestimatesAll$eval.points[[3]]))] =
          Mestimates$estimate[cbind(match(grid[selectionGrid %>% unlist ,1],MestimatesAll$eval.points[[1]]),
                                    match(grid[selectionGrid %>% unlist,2],MestimatesAll$eval.points[[2]]),
                                    match(MestimatesAll$eval.points[[3]][k],MestimatesAll$eval.points[[3]]))]
      }
    })
    printTiming("Assignments", {
      # adjusted for the 3rd dimension
      proportion <- array(NA,dim(Mestimates$estimate))
      for(k in 1:length(MestimatesAll$eval.points[[3]])){
        densityPart[,,k] <- densityPart[,,k]/sum(densityPart[,,k], na.rm=TRUE)
        densityAll[,,k] <- densityAll[,,k]/sum(densityAll[,,k], na.rm=TRUE)
        densityAll[,,k] <- densityAll[,,k]+1E-96
        proportion[,,k] <- (densityPart[,,k]/densityAll[,,k]) *
          (sum(data[data[,5]==unique(data[,5])[k],][,3])/sum(data[data[,5]==unique(data[,5])[k],][,4]))
      }
      proportion[proportion<0]=0
      proportion[proportion>1]=1
      Mestimates$estimate[is.na(Mestimates$estimate)]=1E-96
      ret$resultDensity[j,,,] <- Mestimates$estimate
      ret$proportionArray[j,,,] <- proportion
      ret$resultX[j,,] <- new
    })
    printInfo("Iteration: ",j," of ", burnin+samples," in chain ", chain)
  }
  ret$Mestimates = Mestimates
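  # Persistence note: under parallel execution saveAsBaseFileName is set, so
  # each chain save()s its ret list to '<saveAsBaseFileName><chain>' and
  # returns NA; the dispatching function then load()s these files one by one
  # to fill the result arrays.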
  if (is.na(saveAsBaseFileName)) {
    return( ret )
  } else {
    save(ret, file = paste(saveAsBaseFileName, chain, sep=""), envir = environment())
    return( NA )
  }
}

#' Transfer observations to other shape
#' @param Mestimates Estimation object created by functions dshapebivr and dbivr
#' @param shapefile The new shapefile to which the observations shall be transferred
#' @return
#' The function returns the mean count, the sd and the 90\%-coverage interval of the observations
#' in the polygons of the new shapefile:
#' @export
toOtherShape <- function(Mestimates, shapefile) {
  meanCount <- rep(NA, length(shapefile@polygons))
  sdCount <- rep(NA, length(shapefile@polygons))
  q0.05Count <- rep(NA, length(shapefile@polygons))
  q0.95Count <- rep(NA, length(shapefile@polygons))
  for (i in 1:length(shapefile@polygons)) {
    counts <- Reduce(c, lapply(1:dim(Mestimates$resultX)[1], function(j) {
      sapply((Mestimates$burnin + 1):(Mestimates$burnin + Mestimates$samples), function(x) {
        lapply(1:length(shapefile@polygons[[i]]@Polygons), function(k) {
          if (shapefile@polygons[[i]]@Polygons[[k]]@hole == FALSE) {
            if(length(dim(Mestimates$resultX)) == 3){
              which( point.in.polygon( Mestimates$resultX[x, , 1],
                                       Mestimates$resultX[x, , 2],
                                       shapefile@polygons[[i]]@Polygons[[k]]@coords[, 1],
                                       shapefile@polygons[[i]]@Polygons[[k]]@coords[, 2]) == 1 )
            } else {
              which( point.in.polygon( Mestimates$resultX[j, x, , 1],
                                       Mestimates$resultX[j, x, , 2],
                                       shapefile@polygons[[i]]@Polygons[[k]]@coords[, 1],
                                       shapefile@polygons[[i]]@Polygons[[k]]@coords[, 2]) == 1 )
            }
          }
        }) %>% unlist %>% unique %>% length
      })
    }))
    meanCount[i] <- round(mean(counts))
    sdCount[i] <- round(sd(counts))
    q0.05Count[i] <- round(quantile(counts, 0.05))
    q0.95Count[i] <- round(quantile(counts, 0.95))
  }
  return(data.frame(meanCount, sdCount, q0.05Count, q0.95Count))
}
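# Usage sketch (hypothetical objects, not run): given 'est' returned by
# dshapebivr() and a shapefile 'districts' describing a different partition
# of the same region, the stored draws of the latent coordinates can be
# re-aggregated with
#   counts <- toOtherShape(Mestimates = est, shapefile = districts)
# which yields mean, sd and a 90% coverage interval of the count per polygon.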
setKernelheapingLoglevel <- function(logInfo = TRUE, logDump = FALSE, logTiming = FALSE) {
  assign("logLevelInfo" , logInfo , envir=kernelheapingEnv)
  assign("logLevelDump" , logDump , envir=kernelheapingEnv)
  assign("logLevelTiming", logTiming, envir=kernelheapingEnv)
}

setKernelheapingOutputLevel <- function(outputLevel) {
  assign("outputLevel" , outputLevel , envir=kernelheapingEnv)
}

isOutputlevelHigh <- function() {
  if ( exists("outputLevel", envir=kernelheapingEnv) &&
       get("outputLevel", envir=kernelheapingEnv) > 0) {
    return(TRUE)
  } else {
    return(FALSE)
  }
}

printInfo <- function(...) {
  if ( exists("logLevelInfo", envir=kernelheapingEnv) && get("logLevelInfo", envir=kernelheapingEnv)) {
    print( paste(sep = "", ...) )
  }
}

printDump <- function(...) {
  if ( exists("logLevelDump", envir=kernelheapingEnv) && get("logLevelDump", envir=kernelheapingEnv)) {
    print( paste(sep = "", ...) )
  }
}

printTiming <- function(descr, ...) {
  if ( exists("logLevelTiming", envir=kernelheapingEnv) && get("logLevelTiming", envir=kernelheapingEnv)) {
    t = system.time( ... )
    print( paste(sep = "", "Timing ", descr, ": ", round(t[1], digits=2),
                 " (", round(t[3], digits=2), ")") )
  } else {
    (...)
  }
}
Kernelheaping/R/Kernelheaping.R0000644000176200001440000000204014135254045016125 0ustar liggesusers#' Kernel Density Estimation for Heaped Data
#'
#' In self-reported or anonymized data the user often encounters heaped data,
#' i.e. data which are rounded (to a possibly different degree of coarseness).
#' While this is mostly a minor problem in parametric density estimation the bias can be very large
#' for non-parametric methods such as kernel density estimation. This package implements a partly
#' Bayesian algorithm treating the true unknown values as additional parameters and estimates the
#' rounding parameters to give a corrected kernel density estimate. It supports various standard
#' bandwidth selection methods. Varying rounding probabilities (depending on the true value) and
#' asymmetric rounding is estimable as well. Additionally, bivariate non-parametric density estimation
#' for rounded data is supported.
#'
#'
#' The most important function is \code{\link{dheaping}}. See the help and the attached examples on how to use the package.
#' @docType package
#' @name Kernelheaping
#' @import MASS ks sparr
NULL
Kernelheaping/R/datasets.R0000644000176200001440000000064414135254045015171 0ustar liggesusers#' @name students
#' @title Student0405
#' @description Data collected during 2004 and 2005 from students in statistics classes
#' at a large state university in the northeastern United States.
#' @docType data
#' @references Utts, J. M., & Heckard, R. F. (2011). Mind on statistics. Cengage Learning.
#' @source http://mathfaculty.fullerton.edu/mori/Math120/Data/readme%20datasets/readme.html#Student0405
NULL
Kernelheaping/MD50000644000176200001440000000223214174313054013337 0ustar liggesusersaad08ebc4d9d0e48bd164d74f86a3383 *DESCRIPTION
ba615fe7d521db134599638a4953bf83 *NAMESPACE
7249f70a0ddffada98c4bdbffa7a6d65 *R/Kernelheaping.R
d9e708b7404d20f929b72be892a2d360 *R/datasets.R
a52fed42a440d4f3bd64b466ed0c4c0e *R/functions.R
d744be538667a842810ad17d52ffeb2c *data/students.rda
a15f58691e1555eb343e436af413c8b7 *man/Kernelheaping.Rd
f639c0402f348678f442f9d1dddfaf6a *man/createSim.Kernelheaping.Rd
f042358e654d4d675c5593f871923e8e *man/dbivr.Rd
52ec4dec4def11e26d7537d3218a5d67 *man/dclass.Rd
776ea4d85e33e71b8eeace52ec4059ee *man/dheaping.Rd
01cd82b3fecd45805209dc725eeeb9af *man/dshape3dProp.Rd
36312e7026024bdfea23bf931fc27a0b *man/dshapebivr.Rd
26a9a5f58a36007a2a3553eb2ea29f3e *man/dshapebivrProp.Rd
62c1159e95171806303091547a34e8fd *man/plot.Kernelheaping.Rd
ae6a2f339f314b6239a49b233a9c2d87 *man/plot.bivrounding.Rd
2f2d1713ad95200210533fc84ebcccfc *man/sim.Kernelheaping.Rd
f8025109387d9ef4024dbf77b2f865da *man/simSummary.Kernelheaping.Rd
df1676a46a7c365b5a9596585b883e46 *man/students.Rd
fef6f3bcffd205766394977d0dbe7dac *man/summary.Kernelheaping.Rd
28bedc93e00bca93a4d25d7358d6aaa8 *man/toOtherShape.Rd
6e4c47134f04fbd7e7f582d465927192 *man/tracePlots.Rd