classInt/NAMESPACE
useDynLib(classInt) importFrom(stats, quantile, kmeans, hclust, na.omit, dist, cutree, ecdf, dnorm, sd) importFrom(graphics, plot, abline, rect, par) importFrom(grDevices, colorRampPalette, nclass.Sturges) importFrom(e1071, bclust, clusters.bclust, centers.bclust) importFrom(KernSmooth, dpih) import(class) export(classIntervals, jenks.tests, findColours, findCols, getBclustClassIntervals, getHclustClassIntervals, nPartitions, classIntervals2shingle) S3method(plot, classIntervals) S3method(print, classIntervals) S3method(logLik, classIntervals)
classInt/ChangeLog
2020-03-27 Roger Bivand * : Merge pull request #26 from dieghernan/htvignette Add "headtails" vignette 2020-03-27 dieghernan * vignettes/.gitignore, vignettes/headtailsR.Rmd, vignettes/refs_ht.bib: Create vignette 2020-03-27 dieghernan * .gitignore, DESCRIPTION: Initial config of the vignette 2020-03-21 Roger Bivand * docs/404.html, docs/authors.html, docs/index.html, docs/pkgdown.yml, docs/reference/classIntervals.html, docs/reference/findColours.html, docs/reference/findCols.html, docs/reference/getBclustClassIntervals.html, docs/reference/index.html, docs/reference/jenks.tests.html, docs/reference/logLik.classIntervals.html: rebuild pkgdown 2020-03-21 Roger Bivand * README.md: ping 2020-03-21 Roger Bivand * DESCRIPTION: add missing ) 2020-03-21 Roger Bivand * : Merge pull request #25 from dieghernan/patch-1 add ctb 2020-03-21 Roger Bivand * DESCRIPTION: Update DESCRIPTION 2020-03-21 Roger Bivand * DESCRIPTION: Update DESCRIPTION 2020-03-21 Roger Bivand * DESCRIPTION: Update DESCRIPTION Add Diego as ctb 2020-03-21 Roger Bivand * : Merge pull request #24 from dieghernan/master Add "headtails" #20 2020-03-20 dieghernan * man/classIntervals.Rd: Update man with "headtails" 2020-03-20 dieghernan * R/classInt.R: Change on initialisation of breaks: - Benefit of this is that now breaks on style "headtails" are initialised with units 2020-03-20 dieghernan * R/classInt.R: Update "headtails" following #22 2020-03-17 dieghernan * R/classInt.R: style "headtails" implemented 2020-03-20 Roger Bivand * : Merge pull request #23 from dieghernan/stylesnon Fix #22 2019-12-03 Roger Bivand * DESCRIPTION, R/classInt.R: fixes #19 2019-12-03 Roger Bivand * : commit 9f2fc7976578d00860bc4d72674ffe131d7062ed Author: Roger Bivand Date: Tue Dec 3 15:33:36 2019 +0100 2019-10-18 Roger Bivand * : Merge pull request #17 from wibeasley/patch-1 link to pkgdown site 2019-10-18 Will Beasley * DESCRIPTION: link to pkgdown site for example, like [tidyr](https://github.com/tidyverse/tidyr/blob/master/DESCRIPTION) 2019-09-17 Roger Bivand * DESCRIPTION: Update DESCRIPTION 2019-09-17 Roger Bivand * : Merge pull request #14 from billdenney/add-ctb Add Bill Denney as a contributor 2019-09-17 Bill Denney * DESCRIPTION: Add Bill Denney as a contributor 2019-09-17 Bill Denney * NAMESPACE: Include stats imports for logLik 2019-09-17 Bill Denney * tests/test_Unique.Rout.save: Update test to show logLik tests 2019-09-17 Bill Denney * R/logLik.R, man/logLik.classIntervals.Rd, tests/test_Unique.R: Improve documentation, add tests 2019-09-16 Bill Denney * DESCRIPTION, NAMESPACE, R/logLik.R, man/logLik.classIntervals.Rd: Add logLik method (Fix #6) 2019-08-11 Edzer Pebesma * DESCRIPTION, R/classInt.R: bump version, add support for Date 2019-07-23 Roger Bivand
* : commit 33c05c2763ef7aa25d5a31446a36b6ed131729fd Author: Roger Bivand Date: Tue Jul 23 17:49:45 2019 +0200 2019-07-23 Roger Bivand * : Merge pull request #9 from r-spatial/trytravis try units travis 2019-07-23 Roger Bivand * DESCRIPTION, R/classInt.R, man/classIntervals.Rd: added POSIXt and units support 2019-05-09 Roger Bivand * DESCRIPTION, R/classInt.R, man/classIntervals.Rd: restrict sampling to relevant styles 2019-04-23 Roger Bivand * DESCRIPTION, tests/test_Unique.R, tests/test_Unique.Rout.save: reverse R 3.6 dependency 2019-04-22 Roger Bivand * docs/authors.html, docs/index.html, docs/reference/classIntervals.html, docs/reference/jenks.tests.html: update docs 2019-04-22 Roger Bivand * DESCRIPTION, R/classInt.R, man/classIntervals.Rd, tests/test_Unique.Rout.save: R 3.6 sample ready and sample jenks/fisher like QGIS #7 2019-03-25 Roger Bivand * docs/reference/classIntervals.html, docs/reference/findColours.html, docs/reference/findCols.html, docs/reference/getBclustClassIntervals.html, docs/reference/index.html, docs/reference/jenks.tests.html: update docs for dpih 2019-03-25 Roger Bivand * DESCRIPTION, NAMESPACE, R/classInt.R, man/classIntervals.Rd: add dpih 2018-12-18 Roger Bivand * DESCRIPTION: Update DESCRIPTION 2018-12-18 Roger Bivand * DESCRIPTION: Update DESCRIPTION Add NeedsCompilation=yes 2018-12-18 Roger Bivand * : Merge pull request #5 from angela-li/master Add a note to the README re: compiling from source 2018-12-18 Angela Li * README.md: Add a heading re: compiling from source 2018-12-18 Angela Li * README.md: Add note about making sure gfortran is installed 2018-12-18 Roger Bivand * DESCRIPTION, NAMESPACE, docs/_config.yml, docs/authors.html, docs/index.html, docs/reference/classIntervals.html, docs/reference/findColours.html, docs/reference/findCols.html, docs/reference/getBclustClassIntervals.html, docs/reference/index.html, docs/reference/jenks.tests.html, man/classIntervals.Rd, man/findColours.Rd, man/findCols.Rd, man/getBclustClassIntervals.Rd, man/jenks.tests.Rd, tests/test_Unique.Rout.save: move spData to Suggests 2018-12-14 Roger Bivand * docs/_config.yml: Set theme jekyll-theme-minimal 2018-12-13 Roger Bivand * .Rbuildignore, DESCRIPTION, docs/authors.html, docs/docsearch.css, docs/docsearch.js, docs/index.html, docs/link.svg, docs/pkgdown.css, docs/pkgdown.js, docs/pkgdown.yml, docs/reference/classIntervals.html, docs/reference/findColours.html, docs/reference/findCols.html, docs/reference/getBclustClassIntervals.html, docs/reference/index.html, docs/reference/jenks.tests.html: add pkgdown 2018-04-16 Roger Bivand * tests/test_Unique.Rout.save: add coerce to shingle to NAMESPACE 2018-04-16 Roger Bivand * DESCRIPTION: add coerce to shingle to NAMESPACE 2018-04-16 Roger Bivand * NAMESPACE: add coerce to shingle to NAMESPACE 2017-11-03 Roger Bivand * .travis.yml: Update .travis.yml 2017-11-02 Roger Bivand * README.md: Update README.md 2017-11-02 Roger Bivand * NAMESPACE, tests/test_Unique.Rout.save: rebasing data to spData 2017-11-02 Roger Bivand * DESCRIPTION, inst/ChangeLog, man/classIntervals.Rd, man/findColours.Rd, man/findCols.Rd, man/getBclustClassIntervals.Rd, man/jenks.tests.Rd, man/jenks71.Rd: rebasing data to spData 2017-11-02 Roger Bivand * ChangeLog, DESCRIPTION, oChangeLog: tidying old ChangeLog 2017-11-02 Roger Bivand * .Rbuildignore: Update .Rbuildignore 2017-11-02 Roger Bivand * README.md: add badges to README 2017-11-02 Roger Bivand * .travis.yml: Create .travis.yml 2017-11-02 Roger Bivand * DESCRIPTION, R/classInt.R, man/classIntervals.Rd: 
adding argument to suppress too small n warning 2017-11-01 Roger Bivand * .Rbuildignore: Update .Rbuildignore 2017-11-01 Roger Bivand * README.md: Create README.md 2017-10-22 Edzer Pebesma * : commit 455886261f64b085130f8de88d06c86d474e4a6d Author: Edzer Pebesma Date: Sun Oct 22 13:42:03 2017 +0200 2017-10-22 Roger Bivand * DESCRIPTION: update DESCRIPTION 2017-10-22 Roger Bivand * initial git repo ## Historical record of SVN commits 2009-2017, CVS commits up to 2009 2017-04-14 11:31 rsbivand * DESCRIPTION, src/init.c: added registration 2015-09-28 17:49 rsbivand * ChangeLog, inst/ChangeLog: tidy 2015-09-28 17:49 rsbivand * ChangeLog, DESCRIPTION: tidy 2015-06-28 12:14 rsbivand * DESCRIPTION, NAMESPACE: CRAN _R_CHECK_CODE_USAGE_WITH_ONLY_BASE_ATTACHED_=true NAMESPACE tidy 2015-04-13 15:28 rsbivand * svn2cl.xsl: move to distributed svn2cl 2015-01-10 14:20 rsbivand * data/jenks71.rda: rebuild jenks71.rda 2015-01-10 14:19 rsbivand * DESCRIPTION, data/jenks71.rda: rebuild jenks71.rda 2015-01-06 12:03 rsbivand * DESCRIPTION: tidy 2015-01-06 12:02 rsbivand * DESCRIPTION: tidy 2015-01-06 09:32 rsbivand * ChangeLog, inst/ChangeLog, man/classIntervals.Rd: improvements to jenks documentation 2015-01-05 20:00 rsbivand * ChangeLog, inst/ChangeLog: tidy 2015-01-05 20:00 rsbivand * DESCRIPTION, man/classIntervals.Rd: improvements to jenks documentation 2014-04-06 17:05 rsbivand * ChangeLog: close ring in Polygon 2013-08-30 11:55 rsbivand * ChangeLog, inst/ChangeLog: tidy 2013-08-30 11:54 rsbivand * .Rbuildignore, ChangeLog, inst/ChangeLog: tidy 2013-08-29 14:26 rsbivand * DESCRIPTION, NAMESPACE: tidy 2013-07-28 19:37 rsbivand * DESCRIPTION, NAMESPACE: thinning load depends 2013-06-22 14:40 rsbivand * ChangeLog, inst/ChangeLog: tidy 2013-06-22 14:39 rsbivand * man/classIntervals.Rd, man/findColours.Rd, man/findCols.Rd, man/jenks.tests.Rd: help line lengths 2013-06-22 14:33 rsbivand * ChangeLog, inst/ChangeLog: tidy 2013-06-22 14:33 rsbivand * DESCRIPTION: tidy 2013-06-22 14:31 rsbivand * ChangeLog, inst/ChangeLog: tidy 2013-06-12 10:46 rsbivand * man/classIntervals.Rd, man/findColours.Rd: add more documentation on cutlabels= argument 2013-02-07 10:43 rsbivand * R/classInt.R: handle non-integer GRASS parameters more forgivingly 2012-11-05 17:05 rsbivand * ChangeLog, inst/ChangeLog: tidy 2012-11-05 17:04 rsbivand * DESCRIPTION: tidy 2012-07-22 13:30 rsbivand * DESCRIPTION: Authors@R classInt 2012-07-16 13:50 rsbivand * ChangeLog, inst/ChangeLog: tidy 2012-07-16 13:49 rsbivand * DESCRIPTION, R/classInt.R, man/classIntervals.Rd, tests, tests/test_Unique.R, tests/test_Unique.Rout.save: adding unique revisions, documentation and tests 2012-07-05 17:42 rsbivand * DESCRIPTION: add unique label option, check intervalClosure 2012-07-05 17:41 rsbivand * R/classInt.R, man/classIntervals.Rd: add unique label option, check intervalClosure 2011-11-21 10:34 rsbivand * R/classInt.R, man/classIntervals.Rd: change jenks storage mode to double 2011-11-14 10:58 rsbivand * ChangeLog, inst/ChangeLog: tidy 2011-11-10 07:30 rsbivand * ChangeLog, inst/ChangeLog: dots in fixed style 2011-11-10 07:29 rsbivand * DESCRIPTION, R/classInt.R: dots in fixed style 2011-10-21 15:56 rsbivand * DESCRIPTION, R/classInt.R: classInt NA handling 2011-05-26 21:22 rsbivand * DESCRIPTION, R/classInt.R, man/classIntervals.Rd: block Inf warning in print.classIntervals 2011-02-22 16:37 rsbivand * ChangeLog: tidy 2011-02-22 16:24 rsbivand * oChangeLog, svn2cl.xsl: tidy 2011-02-22 16:18 rsbivand * .: tidy 2009-12-21 10:09 rsbivand * DESCRIPTION,
R/classInt.R, man/classIntervals.Rd: classInterval to shingle 2009-10-20 10:22 rsbivand * ChangeLog, inst/ChangeLog: argument passing 2009-10-20 10:19 rsbivand * DESCRIPTION, R/classInt.R, man/classIntervals.Rd: argument passing 2009-09-17 10:19 rsbivand * DESCRIPTION, man/classIntervals.Rd, man/findColours.Rd, ChangeLog: fix documentation links 2009-05-25 12:20 rsbivand * DESCRIPTION, R/classInt.R, man/classIntervals.Rd, man/findColours.Rd, ChangeLog: representation update 2 2009-05-25 08:17 rsbivand * DESCRIPTION, R/classInt.R, man/classIntervals.Rd: representation overhaul 1 2009-05-12 10:33 rsbivand * ChangeLog: tidy 2009-05-12 10:33 rsbivand * DESCRIPTION, R/classInt.R, man/classIntervals.Rd: correction to jenks style intervals 2008-01-18 22:40 rsbivand * DESCRIPTION: jenks 2007-11-21 19:13 rsbivand * DESCRIPTION, R/classInt.R, man/classIntervals.Rd: Jenks 2007-09-04 14:49 rsbivand * ChangeLog: Changelog 2007-08-24 09:20 rsbivand * DESCRIPTION, man/classIntervals.Rd: methods Rd 2006-12-07 19:19 rsbivand * DESCRIPTION, src/fish1.f: E300 2006-03-20 09:30 rsbivand * DESCRIPTION, NAMESPACE, R/classInt.R, man/classIntervals.Rd, man/findColours.Rd, src/fish1.f: 1-5 2006-03-10 14:13 rsbivand * DESCRIPTION, NAMESPACE, R/classInt.R, data/jenks71.rda, man/classIntervals.Rd, man/findColours.Rd, man/findCols.Rd, man/getBclustClassIntervals.Rd, man/jenks.tests.Rd, man/jenks71.Rd: Initial revision 2006-03-10 14:13 rsbivand * DESCRIPTION, NAMESPACE, R/classInt.R, data/jenks71.rda, man/classIntervals.Rd, man/findColours.Rd, man/findCols.Rd, man/getBclustClassIntervals.Rd, man/jenks.tests.Rd, man/jenks71.Rd: initial import
classInt/man/jenks.tests.Rd
\name{jenks.tests} \alias{jenks.tests} %- Also NEED an '\alias' for EACH other topic documented here. \title{Indices for assessing class intervals} \description{ The function returns values of two indices for assessing class intervals: the goodness of variance fit measure, and the tabular accuracy index; optionally the overview accuracy index is also returned if the \code{area} argument is not missing. } \usage{ jenks.tests(clI, area) } %- maybe also 'usage' for other objects documented here. \arguments{ \item{clI}{a "classIntervals" object} \item{area}{an optional vector of object areas if the overview accuracy index is also required} } \details{ The goodness of variance fit measure is given by Armstrong et al. (2003, p. 600) as: \deqn{GVF = 1 - \frac{\sum_{j=1}^{k}\sum_{i=1}^{N_j}{(z_{ij} - \bar{z}_j)}^2}{\sum_{i=1}^{N}{(z_{i} - \bar{z})}^2}} where the \eqn{z_{i}, i=1,\ldots,N} are the observed values, \eqn{k} is the number of classes, \eqn{\bar{z}_j} the class mean for class \eqn{j}, and \eqn{N_j} the number of counties in class \eqn{j}. The tabular accuracy index is given by Armstrong et al. (2003, p. 600) as: \deqn{TAI = 1 - \frac{\sum_{j=1}^{k}\sum_{i=1}^{N_j}{|z_{ij} - \bar{z}_j|}}{\sum_{i=1}^{N}{|z_{i} - \bar{z}|}}} The overview accuracy index for polygon observations with known areas is given by Armstrong et al. (2003, p. 600) as: \deqn{OAI = 1 - \frac{\sum_{j=1}^{k}\sum_{i=1}^{N_j}{|z_{ij} - \bar{z}_j| a_{ij}}}{\sum_{i=1}^{N}{|z_{i} - \bar{z}| a_i}}} where \eqn{a_i, i=1,\ldots,N} are the polygon areas, and as above the \eqn{a_{ij}} term is indexed over \eqn{j=1,\ldots,k} classes, and \eqn{i=1,\ldots,N_j} polygons in class \eqn{j}. } \value{ a named vector of index values } \references{Armstrong, M.
P., Xiao, N., Bennett, D. A., 2003. "Using genetic algorithms to create multicriteria class intervals for choropleth maps". Annals, Association of American Geographers, 93 (3), 595--623; Jenks, G. F., Caspall, F. C., 1971. "Error on choroplethic maps: definition, measurement, reduction". Annals, Association of American Geographers, 61 (2), 217--244} \author{Roger Bivand } \seealso{\code{\link{classIntervals}}} \examples{ if (!require("spData", quietly=TRUE)) { message("spData package needed for examples") run <- FALSE } else { run <- TRUE } if (run) { data(jenks71, package="spData") fix5 <- classIntervals(jenks71$jenks71, n=5, style="fixed", fixedBreaks=c(15.57, 25, 50, 75, 100, 155.30)) print(jenks.tests(fix5, jenks71$area)) } if (run) { q5 <- classIntervals(jenks71$jenks71, n=5, style="quantile") print(jenks.tests(q5, jenks71$area)) } if (run) { set.seed(1) k5 <- classIntervals(jenks71$jenks71, n=5, style="kmeans") print(jenks.tests(k5, jenks71$area)) } if (run) { h5 <- classIntervals(jenks71$jenks71, n=5, style="hclust", method="complete") print(jenks.tests(h5, jenks71$area)) } if (run) { print(jenks.tests(getHclustClassIntervals(h5, k=7), jenks71$area)) } if (run) { print(jenks.tests(getHclustClassIntervals(h5, k=9), jenks71$area)) } if (run) { set.seed(1) b5 <- classIntervals(jenks71$jenks71, n=5, style="bclust") print(jenks.tests(b5, jenks71$area)) } if (run) { print(jenks.tests(getBclustClassIntervals(b5, k=7), jenks71$area)) } if (run) { print(jenks.tests(getBclustClassIntervals(b5, k=9), jenks71$area)) } } \keyword{spatial}
classInt/man/findColours.Rd
\name{findColours} \alias{findColours} %- Also NEED an '\alias' for EACH other topic documented here. \title{assign colours to classes from classIntervals object} \description{ This helper function is a wrapper for \code{findCols} to extract classes from a "classIntervals" object and assign colours from a palette created by \code{colorRampPalette} from the two or more colours given in the \code{pal} argument. It also returns two attributes for use in constructing a legend. } \usage{ findColours(clI, pal, under="under", over="over", between="-", digits = getOption("digits"), cutlabels=TRUE) } %- maybe also 'usage' for other objects documented here. \arguments{ \item{clI}{a "classIntervals" object} \item{pal}{a character vector of at least two colour names; \code{colorRampPalette} is used internally to create the required number of colours} \item{under}{character string value for "under" in legend if cutlabels=FALSE} \item{over}{character string value for "over" in legend if cutlabels=FALSE} \item{between}{character string value for "between" in legend if cutlabels=FALSE} \item{digits}{minimal number of significant digits in legend} \item{cutlabels}{use cut-style labels in legend} } \value{ a character vector of colours with attributes: "table", a named frequency table; "palette", a character vector of colours corresponding to the specified breaks.
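A minimal sketch (made-up data \code{x}; not taken from the package examples) of how these two attributes feed a legend:
\preformatted{
  x <- c(10, 20, 30, 40, 50)
  cols <- findColours(classIntervals(x, n = 3, style = "quantile"),
                      c("wheat1", "red3"))
  plot(x, col = cols, pch = 19)
  # "palette" gives one colour per class, "table" carries the class labels
  legend("topleft", fill = attr(cols, "palette"),
         legend = names(attr(cols, "table")), bty = "n")
}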
} \author{Roger Bivand } \seealso{\code{\link{classIntervals}}} \examples{ if (!require("spData", quietly=TRUE)) { message("spData package needed for examples") run <- FALSE } else { run <- TRUE } if (run) { data(jenks71, package="spData") pal1 <- c("wheat1", "red3") opar <- par(mfrow=c(2,2)) hCI5 <- classIntervals(jenks71$jenks71, n=5, style="hclust", method="complete") plot(attr(hCI5, "par")) plot(hCI5, pal=pal1, main="hclust k=5") plot(getHclustClassIntervals(hCI5, k=7), pal=pal1, main="hclust k=7") plot(getHclustClassIntervals(hCI5, k=9), pal=pal1, main="hclust k=9") par(opar) } if (run) { set.seed(1) bCI5 <- classIntervals(jenks71$jenks71, n=5, style="bclust") plot(attr(bCI5, "par")) } if (run) { opar <- par(mfrow=c(2,2)) plot(getBclustClassIntervals(bCI5, k=3), pal=pal1, main="bclust k=3") plot(bCI5, pal=pal1, main="bclust k=5") plot(getBclustClassIntervals(bCI5, k=7), pal=pal1, main="bclust k=7") plot(getBclustClassIntervals(bCI5, k=9), pal=pal1, main="bclust k=9") par(opar) } } \keyword{spatial}
classInt/man/logLik.classIntervals.Rd
% Generated by roxygen2: do not edit by hand % Please edit documentation in R/logLik.R \name{logLik.classIntervals} \alias{logLik.classIntervals} \title{Log-likelihood for classIntervals objects} \usage{ \method{logLik}{classIntervals}(object, ...) } \arguments{ \item{object}{A classIntervals object} \item{...}{Ignored.} } \value{ A `logLik` object (see `stats::logLik`). } \description{ Log-likelihood for classIntervals objects } \details{ Generally, the likelihood is a method for minimizing the standard deviation within an interval, and with the AIC, a per-interval penalty can be used to maximize the information and self-similarity of data in the interval. Based on Birge 2006 and Davies 2009 (see references), interval binning selections may be compared by likelihood to optimize the number of intervals selected for a set of data. The `logLik()` function (and associated `AIC()` function) can be used to optimize binning by maximizing the likelihood across choices of intervals. As illustrated by the examples below (the AIC comparison does not specifically select 3 intervals when comparing 2, 3, and 4 intervals for data with 3 intervals), while likelihood-based methods can provide evidence toward optimization of binning, they are not infallible for bin selection. } \examples{ x <- classIntervals(rnorm(100), n=5, style="fisher") logLik(x) AIC(x) # By having a logLik method, AIC.default is used. # When the intervals are made of a limited number of discrete values, the # logLik is zero by definition (the standard deviation is zero giving a Dirac # function at the discrete value indicating a density of 1 and a log-density # of zero). x <- classIntervals(rep(1:2, each=10), n=2, style="jenks") logLik(x) x <- classIntervals(rep(1:3, each=10), n=2, style="jenks") logLik(x) # With slight jitter but notable categorical intervals (at 1, 2, and 3), the # AIC will make selection of the optimal intervals easier. data <- rep(1:3, each=100) + runif(n=300, min=-0.01, max=0.01) x_2 <- classIntervals(data, n=2, style="jenks") x_3 <- classIntervals(data, n=3, style="jenks") x_4 <- classIntervals(data, n=4, style="jenks") AIC(x_2, x_3, x_4) } \references{ Lucien Birge, Yves Rozenholc. How many bins should be put in a regular histogram. ESAIM: Probability and Statistics. 31 January 2006. 10:24-45. url: https://www.esaim-ps.org/articles/ps/abs/2006/01/ps0322/ps0322.html.
doi:10.1051/ps:2006001 Laurie Davies, Ursula Gather, Dan Nordman, Henrike Weinert. A comparison of automatic histogram constructions. ESAIM: Probability and Statistics. 11 June 2009. 13:181-196. url: https://www.esaim-ps.org/articles/ps/abs/2009/01/ps0721/ps0721.html doi:10.1051/ps:2008005 }
classInt/man/classIntervals.Rd
\name{classIntervals} \alias{classIntervals} \alias{print.classIntervals} \alias{plot.classIntervals} \alias{nPartitions} \alias{classIntervals2shingle} %- Also NEED an '\alias' for EACH other topic documented here. \title{Choose univariate class intervals} \description{ The function provides a uniform interface to finding class intervals for continuous numerical variables, for example for choosing colours or symbols for plotting. Class intervals are non-overlapping, and the classes are left-closed --- see \code{findInterval}. Argument values to the style chosen are passed through the dot arguments. \code{classIntervals2shingle} converts a \code{classIntervals} object into a shingle. Labels generated in methods are like those found in \code{\link{cut}} unless cutlabels=FALSE. } \usage{ classIntervals(var, n, style = "quantile", rtimes = 3, ..., intervalClosure = c("left", "right"), dataPrecision = NULL, warnSmallN = TRUE, warnLargeN = TRUE, largeN = 3000L, samp_prop = 0.1, gr = c("[", "]")) \method{plot}{classIntervals}(x, pal, ...) \method{print}{classIntervals}(x, digits = getOption("digits"), ..., under="under", over="over", between="-", cutlabels=TRUE, unique=FALSE) nPartitions(x) classIntervals2shingle(x) } %- maybe also 'usage' for other objects documented here. \arguments{ \item{var}{a continuous numerical variable} \item{n}{number of classes required, if missing, \code{nclass.Sturges} is used; see also the "dpih" and "headtails" styles for automatic choice of the number of classes} \item{style}{chosen style: one of "fixed", "sd", "equal", "pretty", "quantile", "kmeans", "hclust", "bclust", "fisher", "jenks", "dpih" or "headtails"} \item{rtimes}{number of replications of var to catenate and jitter; may be used with styles "kmeans" or "bclust" in case they have difficulties reaching a classification} \item{intervalClosure}{default \dQuote{left}, allows specification of whether partition intervals are closed on the left or the right (added by Richard Dunlap).
Note that the sense of interval closure is hard-coded as \dQuote{right}-closed when \code{style="jenks"} (see Details below).} \item{dataPrecision}{default NULL, permits rounding of the interval endpoints (added by Richard Dunlap)} \item{warnSmallN}{default TRUE, if FALSE, quietens warning for n >= nobs} \item{warnLargeN}{default TRUE, if FALSE large data handling not used} \item{largeN}{default 3000L, the QGIS sampling threshold; over 3000, the observations presented to "fisher" and "jenks" are either a \code{samp_prop=} sample or a sample of 3000, whichever is larger} \item{samp_prop}{default 0.1, QGIS 10\% sampling proportion} \item{gr}{default \code{c("[", "]")}, if the \pkg{units} package is available, \code{units::units_options("group")} may be used directly to give the enclosing bracket style} \item{\dots}{arguments to be passed to the functions called in each style} \item{x}{"classIntervals" object for printing, conversion to shingle, or plotting} \item{under}{character string value for "under" in printed table labels if cutlabels=FALSE} \item{over}{character string value for "over" in printed table labels if cutlabels=FALSE} \item{between}{character string value for "between" in printed table labels if cutlabels=FALSE} \item{digits}{minimal number of significant digits in printed table labels} \item{cutlabels}{default TRUE, use cut-style labels in printed table labels} \item{unique}{default FALSE; if TRUE, collapse labels of single-value classes} \item{pal}{a character vector of at least two colour names for colour coding the class intervals in an ECDF plot; \code{colorRampPalette} is used internally to create the correct number of colours} } \details{ The "fixed" style permits a "classIntervals" object to be specified with given breaks, set in the \code{fixedBreaks} argument; the length of \code{fixedBreaks} should be n+1; this style can be used to insert rounded break values. The "sd" style chooses breaks based on \code{pretty} of the centred and scaled variables, and may have a number of classes different from n; the returned \code{par=} includes the centre and scale values. The "equal" style divides the range of the variable into n parts. The "pretty" style chooses a number of breaks not necessarily equal to n using \code{pretty}, but likely to be legible; arguments to \code{pretty} may be passed through \code{\dots}. The "quantile" style provides quantile breaks; arguments to \code{quantile} may be passed through \code{\dots}. The "kmeans" style uses \code{kmeans} to generate the breaks; it may be anchored using \code{set.seed}; the \code{pars} attribute returns the kmeans object generated; if \code{kmeans} fails, a jittered input vector containing \code{rtimes} replications of \code{var} is tried --- with few unique values in \code{var}, this can prove necessary; arguments to \code{kmeans} may be passed through \code{\dots}. The "hclust" style uses \code{hclust} to generate the breaks using hierarchical clustering; the \code{pars} attribute returns the hclust object generated, and can be used to find other breaks using \code{getHclustClassIntervals}; arguments to \code{hclust} may be passed through \code{\dots}.
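A minimal sketch of the pass-through mechanism (hypothetical vector \code{x}; \code{type=} reaches \code{stats::quantile} and \code{method=} reaches \code{stats::hclust}):
\preformatted{
  set.seed(1)
  x <- rnorm(100)
  # type= is passed through ... to stats::quantile()
  classIntervals(x, n = 5, style = "quantile", type = 7)
  # method= is passed through ... to stats::hclust()
  classIntervals(x, n = 5, style = "hclust", method = "complete")
}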
The "bclust" style uses \code{bclust} to generate the breaks using bagged clustering; it may be anchored using \code{set.seed}; the \code{pars} attribute returns the bclust object generated, and can be used to find other breaks using \code{getBclustClassIntervals}; if \code{bclust} fails, a jittered input vector containing \code{rtimes} replications of \code{var} is tried --- with few unique values in \code{var}, this can prove necessary; arguments to \code{bclust} may be passed through \code{\dots}. The "fisher" style uses the algorithm proposed by W. D. Fisher (1958) and discussed by Slocum et al. (2005) as the Fisher-Jenks algorithm; added here thanks to Hisaji Ono. This style will subsample by default for more than 3000 observations. This style should always be preferred to "jenks" as it uses the original Fortran code and runs nested for-loops much faster. The "jenks" style has been ported from Jenks' code, and has been checked for consistency with ArcView, ArcGIS, and MapInfo (with some remaining differences); added here thanks to Hisaji Ono (originally reported as Basic, now seen as Fortran (as described in a talk last seen at http://www.irlogi.ie/wp-content/uploads/2016/11/NUIM_ChoroHarmful.pdf, slides 26-27)). Note that the sense of interval closure is reversed from the other styles, and in this implementation has to be right-closed - use cutlabels=TRUE in \code{findColours} on the object returned to show the closure clearly, and use \code{findCols} to extract the classes for each value. This style will subsample by default for more than 3000 observations. The "dpih" style uses the \code{dpih()} function from \pkg{KernSmooth} (Wand, 1995) implementing direct plug-in methodology to select the bin width of a histogram. The "headtails" style uses the algorithm proposed by Bin Jiang (2013), in order to find groupings or hierarchy for data with a heavy-tailed distribution. This classification scheme partitions all of the data values around the mean into two parts and continues the process iteratively for the values (above the mean) in the head until the head part values are no longer heavy-tailed distributed. Thus, the number of classes and the class intervals are both naturally determined. By default the algorithm uses \code{thr = 0.4}, meaning that when the head represents more than 40\% of the observations the distribution is not considered heavy-tailed. The threshold argument \code{thr} may be modified through \code{\dots} (see Examples). } \value{ an object of class "classIntervals": \item{var}{the input variable} \item{brks}{a vector of breaks} and attributes: \item{style}{the style used} \item{parameters}{parameter values used in finding breaks} \item{nobs}{number of different finite values in the input variable} \item{call}{this function's call} \item{intervalClosure}{string, whether closure is \dQuote{left} or \dQuote{right}} \item{dataPrecision}{the data precision used for printing interval values in the legend returned by \code{findColours}, and in the \code{print} method for classIntervals objects. If intervalClosure is \dQuote{left}, the value returned is \code{ceiling} of the data value multiplied by 10 to the dataPrecision power, divided by 10 to the dataPrecision power.} } \references{ Armstrong, M. P., Xiao, N., Bennett, D. A., 2003. "Using genetic algorithms to create multicriteria class intervals for choropleth maps". Annals, Association of American Geographers, 93 (3), 595--623; Jenks, G. F., Caspall, F. C., 1971. 
"Error on choroplethic maps: definition, measurement, reduction". Annals, Association of American Geographers, 61 (2), 217--244; Dent, B. D., 1999, Cartography: thematic map design. McGraw-Hill, Boston, 417 pp.; Slocum TA, McMaster RB, Kessler FC, Howard HH 2005 Thematic Cartography and Geographic Visualization, Prentice Hall, Upper Saddle River NJ.; Fisher, W. D. 1958 "On grouping for maximum homogeneity", Journal of the American Statistical Association, 53, pp. 789--798 (\url{http://lib.stat.cmu.edu/cmlib/src/cluster/fish.f}) Wand, M. P. 1995. Data-based choice of histogram binwidth. The American Statistician, 51, 59-64. Jiang, B. 2013 "Head/tail breaks: A new classification scheme for data with a heavy-tailed distribution", The Professional Geographer, 65 (3), 482 – 494. (\url{https://arxiv.org/abs/1209.2801v1}) } \author{Roger Bivand } \note{From version 0.1-11, the default representation has been changed to use \code{cutlabels=TRUE}, and representation within intervals has been corrected, thanks to Richard Dunlap. From version 0.1-15, the print method drops the calculation of the possible number of combinations of observations into classes, which generated warnings for n > 170.} \seealso{\code{\link{findColours}}, \code{\link{findCols}}, \code{\link{pretty}}, \code{\link[stats]{quantile}}, \code{\link[stats]{kmeans}}, \code{\link[stats]{hclust}}, \code{\link[e1071]{bclust}}, \code{\link{findInterval}}, \code{\link[grDevices]{colorRamp}}, \code{\link[grDevices]{nclass}}, \code{\link[lattice]{shingle}}} \examples{ if (!require("spData", quietly=TRUE)) { message("spData package needed for examples") run <- FALSE } else { run <- TRUE } if (run) { data(jenks71, package="spData") pal1 <- c("wheat1", "red3") opar <- par(mfrow=c(2,3)) plot(classIntervals(jenks71$jenks71, n=5, style="fixed", fixedBreaks=c(15.57, 25, 50, 75, 100, 155.30)), pal=pal1, main="Fixed") plot(classIntervals(jenks71$jenks71, n=5, style="sd"), pal=pal1, main="Pretty standard deviations") plot(classIntervals(jenks71$jenks71, n=5, style="equal"), pal=pal1, main="Equal intervals") plot(classIntervals(jenks71$jenks71, n=5, style="quantile"), pal=pal1, main="Quantile") set.seed(1) plot(classIntervals(jenks71$jenks71, n=5, style="kmeans"), pal=pal1, main="K-means") plot(classIntervals(jenks71$jenks71, n=5, style="hclust", method="complete"), pal=pal1, main="Complete cluster") } if (run) { plot(classIntervals(jenks71$jenks71, n=5, style="hclust", method="single"), pal=pal1, main="Single cluster") set.seed(1) plot(classIntervals(jenks71$jenks71, n=5, style="bclust", verbose=FALSE), pal=pal1, main="Bagged cluster") plot(classIntervals(jenks71$jenks71, n=5, style="fisher"), pal=pal1, main="Fisher's method") plot(classIntervals(jenks71$jenks71, n=5, style="jenks"), pal=pal1, main="Jenks' method") plot(classIntervals(jenks71$jenks71, style="dpih"), pal=pal1, main="dpih method") plot(classIntervals(jenks71$jenks71, style="headtails", thr = 1), pal=pal1, main="Head Tails method") par(opar) } if (run) { print(classIntervals(jenks71$jenks71, n=5, style="fixed", fixedBreaks=c(15.57, 25, 50, 75, 100, 155.30))) } if (run) { print(classIntervals(jenks71$jenks71, n=5, style="sd")) } if (run) { print(classIntervals(jenks71$jenks71, n=5, style="equal")) } if (run) { print(classIntervals(jenks71$jenks71, n=5, style="quantile")) } if (run) { set.seed(1) print(classIntervals(jenks71$jenks71, n=5, style="kmeans")) } if (run) { set.seed(1) print(classIntervals(jenks71$jenks71, n=5, style="kmeans", intervalClosure="right")) } if (run) { set.seed(1) 
print(classIntervals(jenks71$jenks71, n=5, style="kmeans", dataPrecision=0)) } if (run) { set.seed(1) print(classIntervals(jenks71$jenks71, n=5, style="kmeans"), cutlabels=FALSE) } if (run) { print(classIntervals(jenks71$jenks71, n=5, style="hclust", method="complete")) } if (run) { print(classIntervals(jenks71$jenks71, n=5, style="hclust", method="single")) } if (run) { set.seed(1) print(classIntervals(jenks71$jenks71, n=5, style="bclust", verbose=FALSE)) } if (run) { print(classIntervals(jenks71$jenks71, n=5, style="bclust", hclust.method="complete", verbose=FALSE)) } if (run) { print(classIntervals(jenks71$jenks71, n=5, style="fisher")) } if (run) { print(classIntervals(jenks71$jenks71, n=5, style="jenks")) } if (run) { print(classIntervals(jenks71$jenks71, style="dpih")) } if (run) { print(classIntervals(jenks71$jenks71, style="dpih", range.x=c(0, 160))) } if (run) { print(classIntervals(jenks71$jenks71, style="headtails")) } if (run) { print(classIntervals(jenks71$jenks71, style="headtails", thr = .45)) } x <- c(0, 0, 0, 1, 2, 50) print(classIntervals(x, n=3, style="fisher")) print(classIntervals(x, n=3, style="jenks")) # Argument 'unique' will collapse the label of classes containing a # single value. This is particularly useful for 'censored' variables # that contain for example many zeros. data_censored<-c(rep(0,10), rnorm(100, mean=20,sd=1),rep(26,10)) plot(density(data_censored)) cl2 <- classIntervals(data_censored, n=5, style="jenks", dataPrecision=2) print(cl2, unique=FALSE) print(cl2, unique=TRUE) \dontrun{ set.seed(1) n <- 1e+05 x <- runif(n) classIntervals(x, n=5, style="sd") classIntervals(x, n=5, style="pretty") classIntervals(x, n=5, style="equal") classIntervals(x, n=5, style="quantile") # the class intervals found vary a little because of sampling classIntervals(x, n=5, style="kmeans") classIntervals(x, n=5, style="fisher") classIntervals(x, n=5, style="fisher") classIntervals(x, n=5, style="fisher") } have_units <- FALSE if (require(units, quietly=TRUE)) have_units <- TRUE if (have_units) { set.seed(1) x_units <- set_units(sample(seq(1, 100, 0.25), 100), km/h) classIntervals(x_units, n=5, style="sd") } if (have_units) { classIntervals(x_units, n=5, style="pretty") } if (have_units) { classIntervals(x_units, n=5, style="equal") } if (have_units) { classIntervals(x_units, n=5, style="quantile") } if (have_units) { classIntervals(x_units, n=5, style="kmeans") } if (have_units) { classIntervals(x_units, n=5, style="fisher") } if (have_units) { classIntervals(x_units, style="headtails") } st <- Sys.time() x_POSIXt <- sample(st+((0:500)*3600), 100) fx <- st+((0:5)*3600)*100 classIntervals(x_POSIXt, style="fixed", fixedBreaks=fx) classIntervals(x_POSIXt, n=5, style="sd") classIntervals(x_POSIXt, n=5, style="pretty") classIntervals(x_POSIXt, n=5, style="equal") classIntervals(x_POSIXt, n=5, style="quantile") classIntervals(x_POSIXt, n=5, style="kmeans") classIntervals(x_POSIXt, n=5, style="fisher") classIntervals(x_POSIXt, style="headtails") # Head Tails method is suitable for right-sided heavy-tailed distributions set.seed(1234) # Heavy tails----- # Pareto distributions a=7 b=14 paretodist <- 7 / (1 - runif(1000)) ^ (1 / 14) # Lognorm lognormdist <- rlnorm(1000) # Weibull weibulldist <- rweibull(1000, 1, scale = 5) pal1 <- c("wheat1", "red3") opar <- par(mfrow = c(2, 3)) plot(classIntervals(paretodist, style = "headtails"), pal = pal1, main = "HeadTails: Pareto Dist.") plot(classIntervals(lognormdist, style = "headtails"), pal = pal1, main = "HeadTails: LogNormal Dist.") 
plot(classIntervals(weibulldist, style = "headtails"), pal = pal1, main = "HeadTails: Weibull Dist.") plot(classIntervals(paretodist, n = 5, style = "fisher"), pal = pal1, main = "Fisher: Pareto Dist.") plot(classIntervals(lognormdist, n = 7, style = "fisher"), pal = pal1, main = "Fisher: LogNormal Dist.") plot(classIntervals(weibulldist, n = 4, style = "fisher"), pal = pal1, main = "Fisher: Weibull Dist.") par(opar) #Non heavy tails, thr should be increased----- #Normal dist normdist <- rnorm(1000) #Left-tailed truncated Normal distr leftnorm <- rep(normdist[normdist < mean(normdist)], 2) # Uniform distribution unifdist <- runif(1000) opar <- par(mfrow = c(2, 3)) plot(classIntervals(normdist, style = "headtails"), pal = pal1, main = "Normal Dist.") plot(classIntervals(leftnorm, style = "headtails"), pal = pal1, main = "Truncated Normal Dist.") plot(classIntervals(unifdist, style = "headtails"), pal = pal1, main = "Uniform Dist.") # thr should be increased for non heavy-tailed distributions plot( classIntervals(normdist, style = "headtails", thr = .6), pal = pal1, main = "Normal Dist. thr = .6" ) plot( classIntervals(leftnorm, style = "headtails", thr = .6), pal = pal1, main = "Truncated Normal Distribution thr = .6" ) plot( classIntervals(unifdist, style = "headtails", thr = .6), pal = pal1, main = "Uniform Distribution thr = .6" ) par(opar) } \keyword{spatial}
classInt/man/findCols.Rd
\name{findCols} \alias{findCols} %- Also NEED an '\alias' for EACH other topic documented here. \title{extract classes from classIntervals object} \description{ This helper function is a wrapper for \code{findInterval} to extract classes from a "classIntervals" object } \usage{ findCols(clI) } %- maybe also 'usage' for other objects documented here. \arguments{ \item{clI}{a "classIntervals" object} } \value{ an integer vector of class indices } \author{Roger Bivand } \seealso{\code{\link{classIntervals}}, \code{\link{findInterval}}} \examples{ if (!require("spData", quietly=TRUE)) { message("spData package needed for examples") run <- FALSE } else { run <- TRUE } if (run) { data(jenks71, package="spData") fix5 <- classIntervals(jenks71$jenks71, n=5, style="fixed", fixedBreaks=c(15.57, 25, 50, 75, 100, 155.30)) print(fix5) } if (run) { print(findCols(fix5)) } } \keyword{spatial}
classInt/DESCRIPTION
Package: classInt Version: 0.4-3 Date: 2020-04-05 Title: Choose Univariate Class Intervals Authors@R: c( person("Roger", "Bivand", role=c("aut", "cre"), email="Roger.Bivand@nhh.no", comment=c(ORCID="0000-0003-2392-6140")), person("Hisaji", "Ono", role="ctb"), person("Richard", "Dunlap", role="ctb"), person("Matthieu", "Stigler", role="ctb"), person("Bill", "Denney", role="ctb", email="wdenney@humanpredictions.com", comment=c(ORCID="0000-0002-5759-428X")), person("Diego", "Hernangómez", role="ctb", email="diego.hernangomezherrero@gmail.com", comment=c(ORCID="0000-0001-8457-4658"))) Depends: R (>= 2.2) Imports: grDevices, stats, graphics, e1071, class, KernSmooth Suggests: spData (>= 0.2.6.2), units, knitr, rmarkdown NeedsCompilation: yes Description: Selected commonly used methods for choosing univariate class intervals for mapping or other graphics purposes.
License: GPL (>= 2) URL: https://r-spatial.github.io/classInt/, https://github.com/r-spatial/classInt/ BugReports: https://github.com/r-spatial/classInt/issues/ RoxygenNote: 6.1.1 Encoding: UTF-8 VignetteBuilder: knitr Packaged: 2020-04-05 16:34:38 UTC; rsb Author: Roger Bivand [aut, cre] (<https://orcid.org/0000-0003-2392-6140>), Hisaji Ono [ctb], Richard Dunlap [ctb], Matthieu Stigler [ctb], Bill Denney [ctb] (<https://orcid.org/0000-0002-5759-428X>), Diego Hernangómez [ctb] (<https://orcid.org/0000-0001-8457-4658>) Maintainer: Roger Bivand <Roger.Bivand@nhh.no> Repository: CRAN Date/Publication: 2020-04-07 11:10:02 UTC
classInt/build/vignette.rds [binary RDS payload omitted]
classInt/tests/test_Unique.Rout.save
R version 3.5.3 (2019-03-11) -- "Great Truth" Copyright (C) 2019 The R Foundation for Statistical Computing Platform: x86_64-pc-linux-gnu (64-bit) R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. Type 'license()' or 'licence()' for distribution details. R is a collaborative project with many contributors. Type 'contributors()' for more information and 'citation()' on how to cite R or R packages in publications. Type 'demo()' for some demos, 'help()' for on-line help, or 'help.start()' for an HTML browser interface to help. Type 'q()' to quit R. > library(classInt) > set.seed(1) > data_censored<-c(rep(0,10), rnorm(100, mean=20,sd=1),rep(26,10)) > cl2<-classIntervals(data_censored, n=4, style="fixed",dataPrecision=2,fixedBreaks=c(-1,1,19,25,30)) > > print(cl2, unique=FALSE) style: fixed one of 166,650 possible partitions of this variable into 4 classes [-1,1) [1,19) [19,25) [25,30] 10 11 89 10 > print(cl2, unique=TRUE) style: fixed one of 166,650 possible partitions of this variable into 4 classes Class found with one single (possibly repeated) value: changed label 0 [1,19) [19,25) 26 10 11 89 10 > > ### example from man page > classIntervals(data_censored, n=5, style="fixed", fixedBreaks=c(15.57, 25, 50, 75, 100, 155.30)) style: fixed one of 4,082,925 possible partitions of this variable into 5 classes [15.57,25) [25,50) [50,75) [75,100) [100,155.3] 110 10 0 0 0 Warning message: In classIntervals(data_censored, n = 5, style = "fixed", fixedBreaks = c(15.57, : variable range greater than fixedBreaks > > print(classIntervals(data_censored, n=5, style="sd"), unique=FALSE) style: sd one of 79,208,745 possible partitions of this variable into 6 classes [-5.126688,0.8860022) [0.8860022,6.898692) [6.898692,12.91138) 10 0 0 [12.91138,18.92407) [18.92407,24.93676) [24.93676,30.94945] 10 90 10 > print(classIntervals(data_censored, n=5, style="sd"), unique=TRUE) style: sd one of 79,208,745 possible partitions of this variable into 6 classes Class found with one single (possibly repeated) value: changed label 0 [0.8860022,6.898692) [6.898692,12.91138) 10 0 0 [12.91138,18.92407) [18.92407,24.93676) 26 10 90 10 > print(classIntervals(data_censored, n=5, style="equal"), unique=TRUE) style: equal one of 4,082,925 possible partitions of this variable into 5 classes Class found with one single (possibly repeated) value: changed label 0 [5.2,10.4) [10.4,15.6) [15.6,20.8) [20.8,26] 10 0 0 81 29 > print(classIntervals(data_censored, n=5, style="quantile"), unique=TRUE) style: quantile one of 4,082,925
possible partitions of this variable into 5 classes [0,19.24129) [19.24129,19.87857) [19.87857,20.39315) [20.39315,21.07048) 24 24 24 24 [21.07048,26] 24 > set.seed(1) > print(classIntervals(data_censored, n=5, style="kmeans"), unique=TRUE) style: kmeans one of 4,082,925 possible partitions of this variable into 5 classes Class found with one single (possibly repeated) value: changed label 0 [8.89265,19.11514) [19.11514,20.31048) [20.31048,24.20081) 10 12 43 45 26 10 > print(classIntervals(data_censored, n=5, style="hclust", method="complete"), unique=TRUE) style: hclust one of 4,082,925 possible partitions of this variable into 5 classes Class found with one single (possibly repeated) value: changed label 0 [8.89265,19.01088) [19.01088,21.00347) [21.00347,24.20081) 10 11 74 15 26 10 > print(classIntervals(data_censored, n=5, style="hclust", method="single"), unique=TRUE) style: hclust one of 4,082,925 possible partitions of this variable into 5 classes Class found with one single (possibly repeated) value: changed label 0 [8.89265,18.33574) [18.33574,21.78784) [21.78784,24.20081) 10 3 94 3 26 10 > print(classIntervals(data_censored, n=5, style="fisher"), unique=TRUE) style: fisher one of 4,082,925 possible partitions of this variable into 5 classes Class found with one single (possibly repeated) value: changed label 0 [8.89265,19.72123) [19.72123,20.85116) [20.85116,24.20081) 10 33 49 18 26 10 > print(classIntervals(data_censored, n=5, style="jenks"), unique=TRUE) style: jenks one of 4,082,925 possible partitions of this variable into 5 classes Class found with one single (possibly repeated) value: changed label 0 (0,19.69582] (19.69582,20.82122] (20.82122,22.40162] 10 33 49 18 26 10 > > print(classIntervals(data_censored, n=5, style="fixed", fixedBreaks=c(15.57, 25, 50, 75, 100, 155.30)), unique=TRUE) style: fixed one of 4,082,925 possible partitions of this variable into 5 classes Class found with one single (possibly repeated) value: changed label [15.57,25) 26 [50,75) [75,100) [100,155.3] 110 10 0 0 0 Warning message: In classIntervals(data_censored, n = 5, style = "fixed", fixedBreaks = c(15.57, : variable range greater than fixedBreaks > print(classIntervals(data_censored, n=5, style="sd"), unique=TRUE) style: sd one of 79,208,745 possible partitions of this variable into 6 classes Class found with one single (possibly repeated) value: changed label 0 [0.8860022,6.898692) [6.898692,12.91138) 10 0 0 [12.91138,18.92407) [18.92407,24.93676) 26 10 90 10 > print(classIntervals(data_censored, n=5, style="equal"), unique=TRUE) style: equal one of 4,082,925 possible partitions of this variable into 5 classes Class found with one single (possibly repeated) value: changed label 0 [5.2,10.4) [10.4,15.6) [15.6,20.8) [20.8,26] 10 0 0 81 29 > print(classIntervals(data_censored, n=5, style="quantile"), unique=TRUE) style: quantile one of 4,082,925 possible partitions of this variable into 5 classes [0,19.24129) [19.24129,19.87857) [19.87857,20.39315) [20.39315,21.07048) 24 24 24 24 [21.07048,26] 24 > set.seed(1) > print(classIntervals(data_censored, n=5, style="kmeans"), unique=TRUE) style: kmeans one of 4,082,925 possible partitions of this variable into 5 classes Class found with one single (possibly repeated) value: changed label 0 [8.89265,19.11514) [19.11514,20.31048) [20.31048,24.20081) 10 12 43 45 26 10 > set.seed(1) > print(classIntervals(data_censored, n=5, style="kmeans", intervalClosure="right"), unique=TRUE) style: kmeans one of 4,082,925 possible partitions of this variable into 5 
classes Class found with one single (possibly repeated) value: changed label 0 (8.89265,19.11514] (19.11514,20.31048] (20.31048,24.20081] 10 12 43 45 26 10 > set.seed(1) > print(classIntervals(data_censored, n=5, style="kmeans", dataPrecision=0), unique=TRUE) style: kmeans one of 4,082,925 possible partitions of this variable into 5 classes Class found with one single (possibly repeated) value: changed label 0 [9,20) [20,21) [21,25) 26 10 12 43 45 10 > set.seed(1) > print(classIntervals(data_censored, n=5, style="kmeans"), cutlabels=FALSE, unique=TRUE) style: kmeans one of 4,082,925 possible partitions of this variable into 5 classes Class found with one single (possibly repeated) value: changed label 0 8.89265 - 19.11514 19.11514 - 20.31048 20.31048 - 24.20081 10 12 43 45 26 10 > print(classIntervals(data_censored, n=5, style="hclust", method="complete"), unique=TRUE) style: hclust one of 4,082,925 possible partitions of this variable into 5 classes Class found with one single (possibly repeated) value: changed label 0 [8.89265,19.01088) [19.01088,21.00347) [21.00347,24.20081) 10 11 74 15 26 10 > print(classIntervals(data_censored, n=5, style="hclust", method="single"), unique=TRUE) style: hclust one of 4,082,925 possible partitions of this variable into 5 classes Class found with one single (possibly repeated) value: changed label 0 [8.89265,18.33574) [18.33574,21.78784) [21.78784,24.20081) 10 3 94 3 26 10 > print(classIntervals(data_censored, n=5, style="fisher"), unique=TRUE) style: fisher one of 4,082,925 possible partitions of this variable into 5 classes Class found with one single (possibly repeated) value: changed label 0 [8.89265,19.72123) [19.72123,20.85116) [20.85116,24.20081) 10 33 49 18 26 10 > print(classIntervals(data_censored, n=5, style="jenks"), unique=TRUE) style: jenks one of 4,082,925 possible partitions of this variable into 5 classes Class found with one single (possibly repeated) value: changed label 0 (0,19.69582] (19.69582,20.82122] (20.82122,22.40162] 10 33 49 18 26 10 > print(classIntervals(data_censored, style="headtails"), unique=TRUE) style: headtails one of 101 possible partitions of this variable into 2 classes [0,18.92407) [18.92407,26] 20 100 > print(classIntervals(data_censored, style="headtails", thr = 1)) style: headtails one of 166,650 possible partitions of this variable into 4 classes [0,18.92407) [18.92407,20.86153) [20.86153,23.03872) [23.03872,26] 20 72 18 10 > print(classIntervals(data_censored, style="headtails", thr = 0)) style: headtails one of 101 possible partitions of this variable into 2 classes [0,18.92407) [18.92407,26] 20 100 > x <- c(0, 0, 0, 1, 2, 50) > print(classIntervals(x, n=3, style="fisher"), unique=TRUE) style: fisher one of 3 possible partitions of this variable into 3 classes Class found with one single (possibly repeated) value: changed label 0 [0.5,26) 50 3 2 1 > print(classIntervals(x, n=3, style="jenks"), unique=TRUE) style: jenks one of 3 possible partitions of this variable into 3 classes Class found with one single (possibly repeated) value: changed label 0 (0,2] 50 3 2 1 > if (getRversion() > "3.5.3") { + suppressWarnings(set.seed(1, sample.kind=c("Rounding"))) + } else { + set.seed(1) + } > print(classIntervals(data_censored, n=5, style="bclust", verbose=FALSE), unique=TRUE) style: bclust one of 4,082,925 possible partitions of this variable into 5 classes Class found with one single (possibly repeated) value: changed label 0 [8.89265,19.01088) [19.01088,21.00347) [21.00347,24.20081) 10 11 74 15 26 10 > 
print(classIntervals(data_censored, n=5, style="bclust", hclust.method="complete", verbose=FALSE), unique=TRUE) style: bclust one of 4,082,925 possible partitions of this variable into 5 classes Class found with one single (possibly repeated) value: changed label 0 [8.89265,19.79106) [19.79106,21.28327) [21.28327,24.20081) 10 34 57 9 26 10 > > # the log-likelihood returns a valid logLik object. > stopifnot( + identical( + round(logLik(classIntervals(rep(1:3, each=10), n=2, style="jenks")), 5), + structure(-14.52876, df = 2, nobs = 30L, class = "logLik") + ) + ) > # logLik for exact intervals (a single value is the unique member of an > # interval) yields a likelihood of zero. > stopifnot( + identical( + suppressWarnings(logLik(classIntervals(rep(1:3, each=10), n=3, style="jenks"))), + structure(0, df = 3, nobs = 30L, class = "logLik") + ) + ) > > proc.time() user system elapsed 0.164 0.028 0.183 classInt/tests/test_Unique.R0000644000176200001440000000627113635361327015643 0ustar liggesuserslibrary(classInt) set.seed(1) data_censored<-c(rep(0,10), rnorm(100, mean=20,sd=1),rep(26,10)) cl2<-classIntervals(data_censored, n=4, style="fixed",dataPrecision=2,fixedBreaks=c(-1,1,19,25,30)) print(cl2, unique=FALSE) print(cl2, unique=TRUE) ### example from man page classIntervals(data_censored, n=5, style="fixed", fixedBreaks=c(15.57, 25, 50, 75, 100, 155.30)) print(classIntervals(data_censored, n=5, style="sd"), unique=FALSE) print(classIntervals(data_censored, n=5, style="sd"), unique=TRUE) print(classIntervals(data_censored, n=5, style="equal"), unique=TRUE) print(classIntervals(data_censored, n=5, style="quantile"), unique=TRUE) set.seed(1) print(classIntervals(data_censored, n=5, style="kmeans"), unique=TRUE) print(classIntervals(data_censored, n=5, style="hclust", method="complete"), unique=TRUE) print(classIntervals(data_censored, n=5, style="hclust", method="single"), unique=TRUE) print(classIntervals(data_censored, n=5, style="fisher"), unique=TRUE) print(classIntervals(data_censored, n=5, style="jenks"), unique=TRUE) print(classIntervals(data_censored, n=5, style="fixed", fixedBreaks=c(15.57, 25, 50, 75, 100, 155.30)), unique=TRUE) print(classIntervals(data_censored, n=5, style="sd"), unique=TRUE) print(classIntervals(data_censored, n=5, style="equal"), unique=TRUE) print(classIntervals(data_censored, n=5, style="quantile"), unique=TRUE) set.seed(1) print(classIntervals(data_censored, n=5, style="kmeans"), unique=TRUE) set.seed(1) print(classIntervals(data_censored, n=5, style="kmeans", intervalClosure="right"), unique=TRUE) set.seed(1) print(classIntervals(data_censored, n=5, style="kmeans", dataPrecision=0), unique=TRUE) set.seed(1) print(classIntervals(data_censored, n=5, style="kmeans"), cutlabels=FALSE, unique=TRUE) print(classIntervals(data_censored, n=5, style="hclust", method="complete"), unique=TRUE) print(classIntervals(data_censored, n=5, style="hclust", method="single"), unique=TRUE) print(classIntervals(data_censored, n=5, style="fisher"), unique=TRUE) print(classIntervals(data_censored, n=5, style="jenks"), unique=TRUE) print(classIntervals(data_censored, style="headtails"), unique=TRUE) print(classIntervals(data_censored, style="headtails", thr = 1)) print(classIntervals(data_censored, style="headtails", thr = 0)) x <- c(0, 0, 0, 1, 2, 50) print(classIntervals(x, n=3, style="fisher"), unique=TRUE) print(classIntervals(x, n=3, style="jenks"), unique=TRUE) if (getRversion() > "3.5.3") { suppressWarnings(set.seed(1, sample.kind=c("Rounding"))) } else { set.seed(1) } 
print(classIntervals(data_censored, n=5, style="bclust", verbose=FALSE), unique=TRUE) print(classIntervals(data_censored, n=5, style="bclust", hclust.method="complete", verbose=FALSE), unique=TRUE) # the log-likelihood returns a valid logLik object. stopifnot( identical( round(logLik(classIntervals(rep(1:3, each=10), n=2, style="jenks")), 5), structure(-14.52876, df = 2, nobs = 30L, class = "logLik") ) ) # logLik for exact intervals (a single value is the unique member of an # interval) yields a likelihood of zero. stopifnot( identical( suppressWarnings(logLik(classIntervals(rep(1:3, each=10), n=3, style="jenks"))), structure(0, df = 3, nobs = 30L, class = "logLik") ) ) classInt/src/0000755000176200001440000000000013642404236012625 5ustar liggesusersclassInt/src/init.c0000644000176200001440000000071213240522652013731 0ustar liggesusers#include #include // for NULL #include /* .Fortran calls */ extern void F77_NAME(fish)(void *, void *, void *, void *, void *, void *, void *, void *); static const R_FortranMethodDef FortranEntries[] = { {"fish", (DL_FUNC) &F77_NAME(fish), 8}, {NULL, NULL, 0} }; void R_init_classInt(DllInfo *dll) { R_registerRoutines(dll, NULL, NULL, FortranEntries, NULL); R_useDynamicSymbols(dll, FALSE); } classInt/src/fish1.f0000644000176200001440000001220713633734704014016 0ustar liggesusersC SUBROUTINE FISH(M, X, VLAB, RLAB, TITLE, K, DMWORK, WORK, DMIWRK, C * IWORK, OUNIT) SUBROUTINE FISH(M, X, K, DMWORK, WORK, DMIWRK, IWORK, LLOUT) C C<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> C C PURPOSE C ------- C C CLUSTERS A SEQUENCE OF CASES INTO SUBSEQUENCES BY FISHER'S C METHOD OF EXACT OPTIMIZATION C C DESCRIPTION C ----------- C C 1. THE "EXACT OPTIMIZATION" METHOD OF W. D. FISHER MAXIMIZES THE C BETWEEN-CLUSTER SUM OF SQUARES. NOTE THAT THE PARTITION IS C GUARANTEED OPTIMAL BUT NOT UNIQUE. C C 2. IF A PARTITION INTO K CLUSTERS IS REQUESTED, OPTIMAL PARTITIONS C INTO K-1, K-2, ..., 2, 1 CLUSTERS ARE ALSO FOUND AND INCLUDED C IN THE OUTPUT. C C 3. THE OUTPUT IS WRITTEN ON FORTRAN UNIT OUNIT AND CONSISTS OF THE C VECTOR OF CASE LABELS AND THE VECTOR OF THE OBSERVATIONS. THEN C THE OPTIMAL PARTITIONS INTO K, K-1, ..., 2, 1 SUBSETS WITH C SUMMARY STATISTICS ARE PRINTED. THEY INCLUDE THE MEAN AND C STANDARD DEVIATION OF THE OBSER- VATIONS FOR EACH CLUSTER FOR C EACH PARTIION. THE MEMBERS OF THE FIRST CLUSTER FOR ANY C PARTITION BEGIN AT THE TOP OF THE VECTOR OF LABELS AND CONTINUE C FOR THE NUMBER IN THE CLUSTER. C C INPUT PARAMETERS C ---------------- C R1MACH(2) = B**EMAX*(1 - B**(-T)), the largest magnitude. C C M INTEGER SCALAR (UNCHANGED ON OUTPUT). C THE NUMBER OF CASES. C C X REAL VECTOR DIMENSIONED AT LEAST M (UNCHANGED ON OUTPUT) C OBSERVED VALUES. C C K INTEGER SCALAR (UNCHANGED ON OUTPUT). C THE NUMBER OF CLUSTER SUBSEQUENCES REQUESTED. C C VLAB 4-CHARACTER VARIABLE (UNCHANGED ON OUTPUT). C THE LABEL OF THE VARIABLE. C C RLAB VECTOR OF 4-CHARACTER VARIABLES DIMENSIONED AT LEAST M. C (UNCHANGED ON OUTPUT). C THE LABELS OF THE CASES. C C TITLE 10-CHARACTER VARIABLE (UNCHANGED ON OUTPUT). C TITLE OF THE DATA SET. C C DMWORK INTEGER SCALAR (UNCHANGED ON OUTPUT). C THE LEADING DIMENSION OF THE MATRIX WORK. MUST BE AT LEAST M. C C WORK REAL MATRIX WHOSE FIRST DIMENSION MUST BE DMWORK AND SECOND C DIMENSION MUST BE AT LEAST K. C WORK MATRIX. C C DMIWRK INTEGER SCALAR (UNCHANGED ON OUTPUT). C THE LEADING DIMENSION OF THE MATRIX IWORK. MUST BE AT LEAST M. 
C C IWORK INTEGER MATRIX WHOSE FIRST DIMENSION MUST BE DMIWRK AND SECOND C DIMENSION MUST BE AT LEAST K. C WORK MATRIX. C C OUNIT INTEGER SCALAR (UNCHANGED ON OUTPUT). C UNIT NUMBER FOR OUTPUT. C C REFERENCES C ---------- C C FISHER, W. D. (1958). "ON GROUPING FOR MAXIMAL HOMOGENEITY," C JOURNAL OF THE AMERICAN STATISTICAL ASSOCIATION 53, 789-798. C C HARTIGAN, J. A. (1975). CLUSTERING ALGORITHMS, JOHN WILEY & C SONS, INC., NEW YORK. PAGES 130-142. C C<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> C C INTEGER DMWORK, DMIWRK, OUNIT IMPLICIT LOGICAL(A-Z) INTEGER DMWORK, DMIWRK, IWORK INTEGER I, J, K, M, II, III, IK, JJ, L, LL, IL, IU DOUBLE PRECISION X, WORK, LLOUT DIMENSION X(*), WORK(DMWORK,*), IWORK(DMIWRK,*), LLOUT(K,*) C CHARACTER*4 VLAB, RLAB(*) C CHARACTER*10 TITLE DOUBLE PRECISION R1MACH2, SS, S, SN, VAR, AMINL, AMAXL C C INITIALIZE AND OUTPUT DATA C R1MACH2=10.E30 DO 10 J=1,K IWORK(1,J)=1 WORK(1,J)=0. DO 11 I=1,M WORK(I,J)=R1MACH2 C 10 WORK(I,J)=R1MACH(2) 11 CONTINUE 10 CONTINUE C IF (OUNIT .GT. 0) THEN C WRITE(OUNIT,1) C 1 FORMAT('1') C CALL OUT(1,M,1,X,VLAB,RLAB,TITLE,OUNIT) C ENDIF C C COMPUTE WORK AND IWORK ITERATIVELY C DO 40 I=1,M SS=0. S=0. DO 30 II=1,I III=I-II+1 SS=SS+X(III)**2 S=S+X(III) SN=II VAR=SS-S**2/SN IK=III-1 IF (IK.NE.0) THEN DO 20 J=2,K IF (WORK(I,J).GE.VAR+WORK(IK,J-1))THEN IWORK(I,J)=III WORK(I,J)=VAR+WORK(IK,J-1) ENDIF 20 CONTINUE ENDIF 30 CONTINUE WORK(I,1)=VAR IWORK(I,1)=1 40 CONTINUE C C PRINT RESULTS C C IF (OUNIT .GT. 0) CALL PFISH(M, X, K, DMWORK, WORK, DMIWRK, C * IWORK, OUNIT) C DO 130 J=1,K J=1 JJ=K-J+1 IL=M+1 DO 120 L=1,JJ LL=JJ-L+1 AMINL=R1MACH2 AMAXL=-R1MACH2 S=0. SS=0. IU=IL-1 IL=IWORK(IU,LL) DO 110 II=IL,IU IF(X(II).GE.AMAXL) AMAXL=X(II) IF(X(II).LE.AMINL) AMINL=X(II) S=S+X(II) SS=SS+X(II)**2 110 CONTINUE SN=IU-IL+1 S=S/SN SS=SS/SN-S**2 SS=SQRT(ABS(SS)) LLOUT(L,1)=AMINL LLOUT(L,2)=AMAXL LLOUT(L,3)=S LLOUT(L,4)=SS C WRITE(OUNIT,4) LL,SN,S,SS C 4 FORMAT(I5,5X,3F10.4) 120 CONTINUE C 130 CONTINUE RETURN END classInt/vignettes/0000755000176200001440000000000013642404236014046 5ustar liggesusersclassInt/vignettes/refs_ht.bib0000644000176200001440000000566613637450523016177 0ustar liggesusers@article{Jiang_2013, title={Head/Tail Breaks: A New Classification Scheme for Data with a Heavy-Tailed Distribution}, volume={65}, ISSN={1467-9272}, url={http://dx.doi.org/10.1080/00330124.2012.700499}, DOI={10.1080/00330124.2012.700499}, number={3}, journal={The Professional Geographer}, publisher={Informa UK Limited}, author={Jiang, Bin}, year={2013}, month={Aug}, pages={482-494} } @article{Jiang2_2013, title={Ht-Index for Quantifying the Fractal or Scaling Structure of Geographic Features}, volume={104}, ISSN={1467-8306}, url={http://dx.doi.org/10.1080/00045608.2013.834239}, DOI={10.1080/00045608.2013.834239}, number={3}, journal={Annals of the Association of American Geographers}, publisher={Informa UK Limited}, author={Jiang, Bin and Yin, Junjun}, year={2013}, month={Oct}, pages={530-540} } @article{vasicek2012, author = {Vasicek, Oldrich}, year = {2002}, month = {12}, pages = {160-62}, title = {Loan Portfolio Value}, journal = {Risk} } @book{taleb_black_2008, added-at = {2010-02-22T16:27:58.000+0100}, address = {London}, author = {Taleb, Nassim Nicholas}, biburl = {https://www.bibsonomy.org/bibtex/22d6bb55b10da553cc7822ae5945a2693/vatchoum}, edition = 1, interhash = {f2b3a16f6d0bf3430d44a7508923ac2b}, intrahash = {2d6bb55b10da553cc7822ae5945a2693}, isbn = {1400063515}, keywords = {AleatoireBD AleatoireC {ALIRE,} {HasardBD}}, publisher = 
{Random House}, timestamp = {2010-02-22T16:44:09.000+0100}, title = {The Black Swan: The Impact of the Highly Improbable}, year = 2008 }

@article{Jiang3_2013, title={Scaling of Geographic Space as a Universal Rule for Map Generalization}, volume={103}, ISSN={1467-8306}, url={http://dx.doi.org/10.1080/00045608.2013.765773}, DOI={10.1080/00045608.2013.765773}, number={4}, journal={Annals of the Association of American Geographers}, publisher={Informa UK Limited}, author={Jiang, Bin and Liu, Xintao and Jia, Tao}, year={2013}, month={Jul}, pages={844-855} }

@article{Jiang_2019, title={A Recursive Definition of Goodness of Space for Bridging the Concepts of Space and Place for Sustainability}, volume={11}, ISSN={2071-1050}, url={http://dx.doi.org/10.3390/su11154091}, DOI={10.3390/su11154091}, number={15}, journal={Sustainability}, publisher={MDPI AG}, author={Jiang, Bin}, year={2019}, month={Jul}, pages={4091} }

@book{Falk_2011, author = {Falk, Michael and Huesler, Juerg and Reiss, Rolf-Dieter}, year = {2011}, title = {Laws of Small Numbers: Extremes and Rare Events}, publisher = {Springer Basel}, doi = {10.1007/978-3-0348-0009-9} }

@article{Fraga_2009, author = {Fraga Alves, Maria and Haan, L.D. and Neves, Claudia}, year = {2009}, pages = {213-227}, title = {Statistical Inference for heavy and super-heavy tailed distributions}, volume = {139}, journal = {Journal of Statistical Planning and Inference} }

classInt/vignettes/headtailsR.Rmd0000644000176200001440000005105213637450523016601 0ustar liggesusers---
title: "Head/Tails breaks on the `classInt` package."
author: "Diego Hernangomez"
date: '`r Sys.Date()`'
output:
  rmarkdown::html_vignette:
    toc: true
    number_sections: false
    toc_depth: 1
bibliography: refs_ht.bib
vignette: >
  %\VignetteIndexEntry{"Head/Tails breaks on the `classInt` package."}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

>*There are far more ordinary people (say, 80 percent) than extraordinary people (say, 20 percent); this is often characterized by the 80/20 principle, based on the observation made by the Italian economist Vilfredo Pareto in 1906 that 80% of land in Italy was owned by 20% of the population. A histogram of the data values for these phenomena would reveal a right-skewed or heavy-tailed distribution. How to map the data with the heavy-tailed distribution?*
>
@Jiang_2013
# Abstract

This vignette discusses the implementation of the "Head/tails breaks" style (@Jiang_2013) in the `classIntervals` function. A step-by-step example is presented in order to clarify the method. A case study using `spData::afcon` is also included, as well as a test suite checking the performance and validation of the implementation.

```{r setup, include=FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```

# Introduction

The **Head/tail breaks**, sometimes referred to as the **ht-index** (@Jiang2_2013), is a classification scheme introduced by @Jiang_2013 in order to find groupings or hierarchy for data with a heavy-tailed distribution.

Heavy-tailed distributions are heavily right skewed, with a minority of large values in the head and a majority of small values in the tail. This imbalance between the head and tail, or between many small values and a few large values, can be expressed as *"far more small things than large things"*.

Heavy-tailed distributions are commonly characterized by a power law, a lognormal or an exponential function. Nature, society, finance (@vasicek2012) and our daily lives are full of rare and extreme events, which are termed "black swan events" (@taleb_black_2008). This line of thought provides a good reason to reverse our thinking by focusing on the low-frequency events.

```{r charheavytail,fig.show='hold'}
library(classInt)
#1. Characterization of heavy-tail distributions----
set.seed(1234)
#Pareto distribution with minimum a and shape (tail index) b
a <- 1
b <- 1.161
n <- 1000
sample_par <- a / (1 - runif(n)) ^ (1 / b)
opar <- par(no.readonly = TRUE)
par(mar = c(2, 4, 3, 1), cex = 0.8)
plot(
  sort(sample_par, decreasing = TRUE),
  type = "l",
  ylab = "F(x)",
  xlab = "",
  main = "80/20 principle"
)
abline(h = quantile(sample_par, .8), lty = 2, col = "red3")
abline(v = 0.2*length(sample_par), lty = 2, col = "darkblue")
legend(
  "topleft",
  legend = c("F(x): p80", "x: Top 20%"),
  col = c("red3", "darkblue"),
  lty = 2,
  cex = 0.8
)
hist(
  sample_par,
  n = 100,
  xlab = "",
  main = "Histogram",
  col = "grey50",
  border = NA,
  probability = TRUE
)
par(opar)
```

# Breaking method

The method itself consists of a four-step process performed recursively until a stopping condition is satisfied. Given a vector of values `var`, the process can be described as follows:

1. Compute `mu = mean(var)`.
2. Break `var` into the `tail` (as `var < mu`) and the `head` (as `var > mu`).
3. Assess whether the proportion of `head` over `var` is less than or equal to a given threshold (i.e. `length(head)/length(var) <= thr`).
4. If 3 is `TRUE`, repeat 1 to 3 until the condition is `FALSE` or no more partitions are possible (i.e. `head` has fewer than two elements, expressed as `length(head) < 2`).

It is important to note that, at the beginning of a new iteration, `var` is replaced by `head`. The underlying hypothesis is to create partitions until the head and the tail are balanced in terms of distribution. So the stopping criterion is satisfied when the last head and the last tail are evenly balanced.

In terms of threshold, @Jiang3_2013 set 40% as a good approximation, meaning that if the head contains more than 40% of the observations the distribution is not considered heavy-tailed.

The final breaks are the vector of consecutive `mu`.

# Step by step example

We reproduce here the pseudo-code^[The method implemented in `classInt` corresponds to head/tails 1.0 as named in this article.]
as per @Jiang_2019:

```
Recursive function Head/tail Breaks:
 Rank the input data from the largest to the smallest
 Break the data into the head and the tail around the mean;
 // the head for those above the mean
 // the tail for those below the mean
 While (head <= 40%):
 Head/tail Breaks (head);
End Function
```

A step-by-step example in **R** (for illustrative purposes) has been developed:

```{r stepbystep, fig.show='hold'}
opar <- par(no.readonly = TRUE)
par(mar = c(2, 2, 3, 1), cex = 0.8)
var <- sample_par
thr <- .4
brks <- c(min(var), max(var)) #Initialise with min and max
sum_table <- data.frame(
  iter = 0,
  mu = NA,
  prop = NA,
  n_var = NA,
  n_head = NA
)
#Pars for chart
limchart <- brks
#Iteration
for (i in 1:10) {
  mu <- mean(var)
  brks <- sort(c(brks, mu))
  head <- var[var > mu]
  prop <- length(head) / length(var)
  keepiter <- prop <= thr & length(head) > 1
  sum_table <- rbind(sum_table, c(i, mu, prop, length(var), length(head)))
  hist(
    var,
    main = paste0("Iter ", i),
    breaks = 50,
    col = "grey50",
    border = NA,
    xlab = "",
    xlim = limchart
  )
  abline(v = mu, col = "red3", lty = 2)
  ylabel <- max(hist(var, breaks = 50, plot = FALSE)$counts)
  labelplot <- paste0("PropHead: ", round(prop * 100, 2), "%")
  text(
    x = mu,
    y = ylabel,
    labels = labelplot,
    cex = 0.8,
    pos = 4
  )
  legend(
    "right",
    legend = paste0("mu", i),
    col = c("red3"),
    lty = 2,
    cex = 0.8
  )
  if (isFALSE(keepiter)) break
  var <- head
}
par(opar)
```

As can be seen, in each iteration the resulting head gradually loses the heavy-tail property, until the stopping condition is met.

```{r hiddtable, echo=FALSE}
sum_table$mu <- round(sum_table$mu,4)
sum_table$prop <- paste0(round(100*sum_table$prop,2),"%")
knitr::kable(sum_table[!is.na(sum_table$mu),], row.names = FALSE)
```

The resulting breaks are then defined as `breaks = c(min(var), mu(iter=1), ..., mu(iter), max(var))`.

# Implementation in the `classInt` package

The implementation in the `classIntervals` function should replicate the results:

```{r checkmethod}
ht_sample_par <- classIntervals(sample_par, style = "headtails")
brks == ht_sample_par$brks
print(ht_sample_par)
```

As stated in @Jiang_2013, the number of breaks is naturally determined; however, the `thr` parameter can help to adjust the final number. A lower value of `thr` provides fewer breaks, while a larger `thr` increases the number, if the underlying distribution follows the *"far more small things than large things"* principle.

```{r examplesimp, fig.show='hold', fig.asp=.7}
opar <- par(no.readonly = TRUE)
par(mar = c(2, 2, 2, 1), cex = 0.8)
pal1 <- c("wheat1", "wheat2", "red3")

# Minimum: single break
print(classIntervals(sample_par, style = "headtails", thr = 0))
plot(
  classIntervals(sample_par, style = "headtails", thr = 0),
  pal = pal1,
  main = "thr = 0"
)

# Two breaks
print(classIntervals(sample_par, style = "headtails", thr = 0.2))
plot(
  classIntervals(sample_par, style = "headtails", thr = 0.2),
  pal = pal1,
  main = "thr = 0.2"
)

# Default breaks: 0.4
print(classIntervals(sample_par, style = "headtails"))
plot(classIntervals(sample_par, style = "headtails"),
     pal = pal1,
     main = "thr = Default")

# Maximum breaks
print(classIntervals(sample_par, style = "headtails", thr = 1))
plot(
  classIntervals(sample_par, style = "headtails", thr = 1),
  pal = pal1,
  main = "thr = 1"
)
par(opar)
```

The method always returns at least one break, corresponding to `mean(var)`.
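For readers who prefer a single function, the recursion can also be condensed into a few lines of R. The following sketch is illustrative only: `ht_breaks` is not part of `classInt`, and it assumes a clean numeric vector (no missing or infinite values) and no cap on the number of iterations, whereas `classIntervals` additionally clamps `thr` to `[0, 1]` and stops after at most 100 iterations.

```{r htsketch}
# Minimal head/tails sketch (illustrative only, not the classInt API)
ht_breaks <- function(var, thr = 0.4) {
  brks <- min(var)            # initialise with the minimum
  head <- var
  repeat {
    mu <- mean(head)
    brks <- c(brks, mu)       # every mu becomes a break
    prop <- mean(head > mu)   # proportion of the head above its mean
    head <- head[head > mu]   # the head becomes the new var
    if (prop > thr || length(head) < 2) break
  }
  sort(unique(c(brks, max(var)))) # close the last class with the maximum
}
# Should agree with the implemented style on this sample
all.equal(ht_breaks(sample_par),
          classIntervals(sample_par, style = "headtails")$brks)
```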
# Case study

@Jiang_2013 states that "the new classification scheme is more natural than the natural breaks in finding the groupings or hierarchy for data with a heavy-tailed distribution." (p. 482), referring to Jenks' natural breaks method. In this case study we compare "headtails" against "fisher", which is an alias for the Fisher-Jenks algorithm and is always preferred to the "jenks" style (see `?classIntervals`).

For this example we will use the `afcon` dataset from the `spData` package.

```{r loadspdata, message=FALSE}
library(spData)
data(afcon, package = "spData")
```

Let's have a look at the top 10 values and the distribution of the variable `totcon` (index of total conflict 1966-78):

```{r summspdata, fig.show='hold'}
# Top10
knitr::kable(head(afcon[order(afcon$totcon, decreasing = TRUE),c("name","totcon")],10))

opar <- par(no.readonly = TRUE)
par(mar = c(4, 4, 3, 1), cex = 0.8)
hist(afcon$totcon,
     n = 20,
     main = "Histogram",
     xlab = "totcon",
     col = "grey50",
     border = NA,
)
plot(
  density(afcon$totcon),
  main = "Distribution",
  xlab = "totcon",
)
par(opar)
```

The data shows that EG and SU present a clear hierarchy over the rest of the values. The histogram confirms a heavy-tailed distribution, and therefore the *"far more small things than large things"* principle holds.

As an additional check, on top of "headtails" and "fisher" we also use "quantile" to get a broader view of the different breaking styles. As "quantile" is a position-based metric, it doesn't account for the magnitude of F(x) (hierarchy), so the breaks are defined solely by the position of x in the distribution.

Applying the three aforementioned methods to break the data:

```{r breaksample,fig.show='hold'}
brks_ht <- classIntervals(afcon$totcon, style = "headtails")
print(brks_ht)

#Same number of classes for "fisher"
nclass <- length(brks_ht$brks) - 1
brks_fisher <- classIntervals(afcon$totcon, style = "fisher", n = nclass)
print(brks_fisher)

brks_quantile <- classIntervals(afcon$totcon, style = "quantile", n = nclass)
print(brks_quantile)

pal1 <- c("wheat1", "wheat2", "red3")
opar <- par(no.readonly = TRUE)
par(mar = c(2, 2, 2, 1), cex = 0.8)
plot(brks_ht, pal = pal1, main = "headtails")
plot(brks_fisher, pal = pal1, main = "fisher")
plot(brks_quantile, pal = pal1, main = "quantile")
par(opar)
```

It is observed that the top three classes of "headtails" enclose 5 observations, whereas "fisher" includes 13 observations. In terms of classification, "headtails" breaks focus more on extreme values.

The next plot compares a continuous distribution of `totcon` rescaled to the range `[1, nclass]` versus the distribution across breaks for each style. The continuous distribution has been offset by -0.5 in order to align the continuous and the discrete distributions.
```{r benchmarkbreaks, fig.show='hold', fig.width=7}
#Helper function to rescale values
help_reescale <- function(x, min = 1, max = 10) {
  r <- (x - min(x)) / (max(x) - min(x))
  r <- r * (max - min) + min
  return(r)
}
afcon$ecdf_class <- help_reescale(afcon$totcon,
                                  min = 1 - 0.5,
                                  max = nclass - 0.5)
afcon$ht_breaks <- cut(afcon$totcon,
                       brks_ht$brks,
                       labels = FALSE,
                       include.lowest = TRUE)

afcon$fisher_breaks <- cut(afcon$totcon,
                           brks_fisher$brks,
                           labels = FALSE,
                           include.lowest = TRUE)

afcon$quantile_break <- cut(afcon$totcon,
                            brks_quantile$brks,
                            labels = FALSE,
                            include.lowest = TRUE)

opar <- par(no.readonly = TRUE)
par(mar = c(4, 4, 1, 1), cex = 0.8)
plot(
  density(afcon$ecdf_class),
  ylim = c(0, 0.8),
  lwd = 2,
  main = "",
  xlab = "class"
)
lines(density(afcon$ht_breaks), col = "darkblue", lty = 2)
lines(density(afcon$fisher_breaks), col = "limegreen", lty = 2)
lines(density(afcon$quantile_break), col = "red3", lty = 2)
legend("topright",
       legend = c("Continuous", "headtails", "fisher", "quantile"),
       col = c("black", "darkblue", "limegreen", "red3"),
       lwd = c(2, 1, 1, 1),
       lty = c(1, 2, 2, 2),
       cex = 0.8
)
par(opar)
```

It can be observed that the distribution of "headtails" breaks is also heavy-tailed, and closer to the original distribution. On the other extreme, "quantile" provides a quasi-uniform distribution, ignoring the `totcon` hierarchy.

In terms of data visualization, we compare here the final maps produced with the techniques mentioned above. In this plotting exercise:

- The `cex` of the points is always between `1` and `5`.
- For the continuous approach, no classes are provided. This plot will be used as the reference.
- For all the other styles, the `col` and `cex` of each point are defined by the class of that point.

```{r finalplot , fig.show='hold', fig.asp=1.2}
custompal <- c("#FE9F6D99", "#DE496899", "#8C298199", "#3B0F7099", "#00000499")
afcon$cex_points <- help_reescale(afcon$totcon, min = 1, max = 5)
opar <- par(no.readonly = TRUE)
par(mar = c(1.5, 1.5, 2, 1.5), cex = 0.8)
# Plot continuous
plot(
  x = afcon$x,
  y = afcon$y,
  axes = FALSE,
  cex = afcon$cex_points,
  pch = 20,
  col = "grey50",
  main = "Continuous",
)
mcont <- (max(afcon$totcon) - min(afcon$totcon)) / 4
legcont <- 1:5 * mcont - (mcont - min(afcon$totcon))
legend("bottomleft",
       xjust = 1,
       bty = "n",
       legend = paste0(" ", round(legcont, 0)),
       col = "grey50",
       pt.cex = seq(1, 5),
       pch = 20,
       title = "totcon"
)
box()

plot(
  x = afcon$x,
  y = afcon$y,
  axes = FALSE,
  cex = afcon$ht_breaks,
  pch = 20,
  col = custompal[afcon$ht_breaks],
  main = "headtails"
)
legend(
  "bottomleft",
  xjust = 1,
  bty = "n",
  legend = paste0(" ", round(brks_ht$brks[2:6], 0)),
  col = custompal,
  pt.cex = seq(1, 5),
  pch = 20,
  title = "totcon"
)
box()

plot(
  x = afcon$x,
  y = afcon$y,
  axes = FALSE,
  cex = afcon$fisher_breaks,
  pch = 20,
  col = custompal[afcon$fisher_breaks],
  main = "fisher"
)
legend(
  "bottomleft",
  xjust = 1,
  bty = "n",
  legend = paste0(" ", round(brks_fisher$brks[2:6], 0)),
  col = custompal,
  pt.cex = seq(1, 5),
  pch = 20,
  title = "totcon"
)
box()

plot(
  x = afcon$x,
  y = afcon$y,
  axes = FALSE,
  cex = afcon$quantile_break,
  pch = 20,
  col = custompal[afcon$quantile_break],
  main = "quantile"
)
legend(
  "bottomleft",
  xjust = 1,
  bty = "n",
  legend = paste0(" ", round(brks_quantile$brks[2:6], 0)),
  col = custompal,
  pt.cex = seq(1, 5),
  pch = 20,
  title = "totcon"
)
box()
par(opar)
```

As per the results, "headtails" seems to provide a better understanding of the most extreme values when the result is compared against the continuous plot.
The "quantile" style, as expected, just provides a clustering without taking into account the real hierarchy. The "fisher" plot is in between these two interpretations.

It is also important to note that "headtails" and "fisher" reveal different information that can be useful depending on the context. While "headtails" highlights the outliers, it fails to provide a good clustering of the tail, whereas "fisher" seems to reflect these patterns better. This can be observed in the values of Western Africa and the Niger River Basin: where "headtails" doesn't highlight any special cluster of conflicts, "fisher" suggests a potential cluster. This can be confirmed in the histogram generated previously, where a concentration of `totcon` around 1,000 is visible.

# Testing and benchmark

In this section the performance of the "headtails" implementation is tested, in terms of speed and handling of corner cases. A small benchmark against other styles is also presented. Testing has been performed on the following distributions:

**Heavy-tailed distributions**

- Pareto
- Exponential
- Log-normal
- Weibull
- Log-Cauchy, also known as a super-heavy-tailed distribution (@Falk_2011, p. 80, @Fraga_2009)

**Non heavy-tailed distributions**

- Normal (non heavy-tailed)
- Truncated Normal (left-tailed)
- Uniform distribution

```{r distest, fig.show='hold'}
#Init samples
set.seed(2389)

#Pareto distribution with minimum a and shape b
a <- 7
b <- 14
paretodist <- a / (1 - runif(5000000)) ^ (1 / b)
#Exponential dist
expdist <- rexp(5000000)
#Lognorm
lognormdist <- rlnorm(5000000)
#Weibull
weibulldist <- rweibull(5000000, 1, scale = 5)
#LogCauchy "super-heavy tail"
logcauchdist <- exp(rcauchy(5000000, 2, 4))
#Remove Inf
logcauchdist <- logcauchdist[logcauchdist < Inf]
#Normal dist
normdist <- rnorm(5000000)
#Left-tailed distr
leftnorm <- sample(rep(normdist[normdist < mean(normdist)], 3),
                   size = 5000000)
#Uniform distribution
unifdist <- runif(5000000)
```

Let's define a helper function and proceed to run the whole test suite:

```{r testresults, fig.show='hold'}
testresults <- data.frame(
  Title = NA,
  style = NA,
  nsample = NA,
  threshold = NA,
  nbreaks = NA,
  time_secs = NA
)

benchmarkdist <- function(dist,
                          style = "headtails",
                          thr = 0.4,
                          title = "",
                          plot = FALSE) {
  init <- Sys.time()
  br <- classIntervals(dist, style = style, thr = thr)
  a <- Sys.time() - init
  test <- data.frame(
    Title = title,
    style = style,
    nsample = format(length(br$var),
                     scientific = FALSE,
                     big.mark = ","),
    threshold = thr,
    nbreaks = length(br$brks) - 1,
    time_secs = as.character(round(a, 4))
  )
  testresults <- unique(rbind(testresults, test))
  if (plot) {
    plot(
      density(br$var,
              from = quantile(dist, .0005),
              to = quantile(dist, .9995)),
      col = "black",
      cex.main = .9,
      main = paste0(title, " ", style,
                    ", thr =", thr,
                    ", nbreaks = ", length(br$brks) - 1),
      ylab = "",
      xlab = ""
    )
    abline(v = br$brks, col = "red3", lty = 2)
  }
  return(testresults)
}

opar <- par(no.readonly = TRUE)
par(mar = c(2, 2, 2, 2), cex = 0.8)

# Pareto----
testresults <- benchmarkdist(paretodist, title = "Pareto", plot = TRUE)
testresults <- benchmarkdist(paretodist, title = "Pareto", thr = 0)
testresults <- benchmarkdist(paretodist, title = "Pareto", thr = .75, plot = TRUE)

#Sample 2,000 obs
set.seed(1234)
Paretosamp <- sample(paretodist, 2000)
testresults <- benchmarkdist(Paretosamp,
                             title = "Pareto sample",
                             style = "fisher",
                             plot = TRUE)
testresults <- benchmarkdist(Paretosamp,
                             title = "Pareto sample",
                             style = "headtails",
                             plot = TRUE)

#Exponential----
testresults <- benchmarkdist(expdist, title = "Exponential", plot = TRUE)
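# thr = 0 yields the minimum of a single break at the mean, while thr = 1
# keeps partitioning while the head holds at least two values; out-of-range
# values (e.g. thr = -100 or thr = 500 below) are clamped to [0, 1]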
testresults <- benchmarkdist(expdist, title = "Exponential", thr = 0)
testresults <- benchmarkdist(expdist, title = "Exponential", thr = 1)
testresults <- benchmarkdist(expdist, title = "Exponential", style = "quantile", plot = TRUE)

#Weibull-----
testresults <- benchmarkdist(weibulldist, title = "Weibull", plot = TRUE)
testresults <- benchmarkdist(weibulldist, title = "Weibull", thr = 0)
testresults <- benchmarkdist(weibulldist, title = "Weibull", thr = 1)

#Logcauchy
testresults <- benchmarkdist(logcauchdist, title = "LogCauchy", plot = TRUE)
testresults <- benchmarkdist(logcauchdist, title = "LogCauchy", thr = 0)
testresults <- benchmarkdist(logcauchdist, title = "LogCauchy", thr = 1)

#Normal----
testresults <- benchmarkdist(normdist, title = "Normal", plot = TRUE)
testresults <- benchmarkdist(normdist, title = "Normal", thr = 0)
testresults <- benchmarkdist(normdist, title = "Normal", thr = 1, plot = TRUE)

#Truncated Left-tail Normal----
testresults <- benchmarkdist(leftnorm, title = "Left Normal", plot = TRUE)
testresults <- benchmarkdist(leftnorm, title = "Left Normal", thr = -100)
testresults <- benchmarkdist(leftnorm, title = "Left Normal", plot = TRUE, thr = 500)

#Uniform----
testresults <- benchmarkdist(unifdist, title = "Uniform", plot = TRUE, thr = 0.7)
testresults <- benchmarkdist(unifdist, title = "Uniform", thr = 0)
testresults <- benchmarkdist(unifdist, title = "Uniform", plot = TRUE, thr = 1)
par(opar)

# Results
knitr::kable(testresults[-1, ], row.names = FALSE)
```

The implementation works as expected, with good performance given the size of the samples, and it also compares well with other current implementations in `classIntervals`.

# References

classInt/R/0000755000176200001440000000000013642343601012235 5ustar liggesusersclassInt/R/classInt.R0000644000176200001440000005141413635361327014154 0ustar liggesusersgvf <- function(var, cols) { sumsq <- function(x) sum((x - mean(x))^2) sdam <- sumsq(var) sdcm <- sum(tapply(var, factor(cols), sumsq)) res <- 1 - (sdcm/sdam) res } tai <- function(var, cols) { sumabs <- function(x) sum(abs(x - mean(x))) x <- sumabs(var) y <- sum(tapply(var, factor(cols), sumabs)) res <- 1 - (y/x) res } oai <- function(var, cols, area) { sumabs1 <- function(x) sum(abs(x[,1] - mean(x[,1]))*x[,2]) m <- cbind(as.numeric(var), as.numeric(area)) x <- sumabs1(m) y <- sum(by(m, factor(cols), sumabs1)) res <- 1 - (y/x) res } jenks.tests <- function(clI, area) { if (class(clI) != "classIntervals") stop("Class interval object required") cols <- findCols(clI) res <- c("# classes"=length(clI$brks)-1, "Goodness of fit"=gvf(clI$var, cols), "Tabular accuracy"=tai(clI$var, cols)) if (!missing(area)) { if (length(area) != length(cols)) stop("area and classified variable different lengths") res <- c(res, "Overview accuracy"=oai(clI$var, cols, area)) } res } plot.classIntervals <- function(x, pal, ...) { if (class(x) != "classIntervals") stop("Class interval object required") if (length(pal) < 2) stop("pal must contain at least two colours") pal_out <- colorRampPalette(pal)(length(x$brks)-1) plot(ecdf(x$var), ...)
stbrks <- cbind(x$brks[-length(x$brks)], x$brks[-1]) abline(v=x$brks, col="grey") for (i in 1:nrow(stbrks)) rect(stbrks[i,1], par("usr")[3], stbrks[i,2], 0, col=pal_out[i], border="transparent") } classIntervals2shingle <- function(x) { res <- x$var nl <- length(x$brks) - 1 lres <- vector(mode="list", length=nl) for (i in 1:nl) lres[[i]] <- x$brks[c(i, i+1)] class(lres) <- "shingleLevel" attr(res, "levels") <- lres class(res) <- "shingle" res } # change contributed by Richard Dunlap 090512 # Added intervalClosure argument to allow specification of whether # partition intervals are closed on the left or the right # Added dataPrecision argument to allow rounding of interval boundaries # to the precision -- the argument equals the number of # decimal places in the data. Negative numbers retain the usual # convention for rounding. classIntervals <- function(var, n, style="quantile", rtimes=3, ..., intervalClosure=c("left", "right"), dataPrecision=NULL, warnSmallN=TRUE, warnLargeN = TRUE, largeN = 3000L, samp_prop = 0.1, gr=c("[", "]")) { if (is.factor(var)) stop("var is categorical") # https://github.com/r-spatial/classInt/issues/8 TZ <- NULL POSIX <- FALSE DATE <- FALSE if (!is.numeric(var)) { if (inherits(var, "POSIXt")) { TZ <- attr(var, "tzone") POSIX <- TRUE var <- unclass(as.POSIXct(var)) } else if (inherits(var, "Date")) { var <- unclass(var) DATE <- TRUE } else { stop("var is not numeric") } } UNITS <- NULL if (inherits(var, "units")) { UNITS <- paste0(gr[1], as.character(attr(var, "units")), gr[2]) } # Matthieu Stigler 120705 intervalClosure <- match.arg(intervalClosure) ovar <- var if (length(style) > 1L) style <- style[1L] if (any(is.na(var))) { warning("var has missing values, omitted in finding classes") var <- c(na.omit(var)) } if (any(!is.finite(var))) { warning("var has infinite values, omitted in finding classes") is.na(var) <- !is.finite(var) } nobs <- length(unique(var)) if (nobs == 1) stop("single unique value") # Fix 22: Diego Hernangómez needn <- !(style %in% c("dpih", "headtails")) if (missing(n)) n <- nclass.Sturges(var) if (n < 2 & needn) stop("n less than 2") n <- as.integer(n) pars <- NULL if (n > nobs & needn) { if (warnSmallN) { warning(paste("n greater than number of different finite values", "n reset to number of different finite values", sep="\\n")) } n <- nobs } if (n == nobs & needn) { if (warnSmallN) { warning(paste("n same as number of different finite values", "each different finite value is a separate class", sep="\\n")) } sVar <- sort(unique(var)) dsVar <- diff(sVar) brks <- c(sVar[1]-(mean(dsVar)/2), sVar[1:(length(sVar)-1)]+(dsVar/2), sVar[length(sVar)]+(mean(dsVar)/2)) style="unique" } else { # introduced related to https://github.com/r-spatial/classInt/issues/7 sampling <- FALSE if (warnLargeN && (style %in% c("kmeans", "hclust", "bclust", "fisher", "jenks"))) { if (nobs > largeN) { warning("N is large, and some styles will run very slowly; sampling imposed") sampling <- TRUE nsamp <- ifelse(samp_prop*nobs > 3000, as.integer(ceiling(samp_prop*nobs)), 3000L) } } if (style =="fixed") { # mc <- match.call(expand.dots=FALSE) # fixedBreaks <- sort(eval(mc$...$fixedBreaks)) # Matthieu Stigler 111110 dots <- list(...) 
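# user-supplied breaks arrive through the dots list; numeric, POSIXt and
# Date breaks are handled below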
fixedBreaks <- sort(dots$fixedBreaks) if (is.null(fixedBreaks)) stop("fixed method requires fixedBreaks argument") # if (length(fixedBreaks) != (n+1)) # stop("mismatch between fixedBreaks and n") if (!is.numeric(fixedBreaks)) { # fixedBreaks assumed to be TZ-compliant with var if (inherits(fixedBreaks, "POSIXt") && POSIX) { fixedBreaks <- unclass(as.POSIXct(fixedBreaks)) } else if (inherits(fixedBreaks, "Date") && DATE) { fixedBreaks <- unclass(fixedBreaks) } else { stop("fixedBreaks must be numeric") } } if (any(diff(fixedBreaks) < 0)) stop("decreasing fixedBreaks found") if (min(var) < fixedBreaks[1] || max(var) > fixedBreaks[length(fixedBreaks)]) warning("variable range greater than fixedBreaks") brks <- fixedBreaks } else if (style =="sd") { svar <- scale(var) pars <- c(attr(svar, "scaled:center"), attr(svar, "scaled:scale")) names(pars) <- c("center", "scale") sbrks <- pretty(x=svar, n=n, ...) brks <- c((sbrks * pars[2]) + pars[1]) } else if (style =="equal") { brks <- seq(min(var), max(var), length.out=(n+1)) } else if (style =="pretty") { brks <- c(pretty(x=var, n=n, ...)) } else if (style =="quantile") { # stats brks <- c(quantile(x=var, probs=seq(0,1,1/n), ...)) names(brks) <- NULL } else if (style =="kmeans") { # stats pars <- try(kmeans(x=var, centers=n, ...)) if (class(pars) == "try-error") { warning("jittering in kmeans") jvar <- jitter(rep(x=var, times=rtimes)) pars <- try(kmeans(x=jvar, centers=n, ...)) if (class(pars) == "try-error") stop("kmeans failed after jittering") else { cols <- match(pars$cluster, order(c(pars$centers))) rbrks <- unlist(tapply(jvar, factor(cols), range)) } } else { cols <- match(pars$cluster, order(c(pars$centers))) rbrks <- unlist(tapply(var, factor(cols), range)) } names(rbrks) <- NULL brks <- .rbrks(rbrks) } else if (style =="hclust") { # stats pars <- hclust(dist(x=var, method="euclidean"), ...) rcluster <- cutree(tree=pars, k=n) rcenters <- unlist(tapply(var, factor(rcluster), mean)) cols <- match(rcluster, order(c(rcenters))) rbrks <- unlist(tapply(var, factor(cols), range)) names(rbrks) <- NULL brks <- .rbrks(rbrks) } else if (style =="bclust") { # e1071, class pars <- try(bclust(x=var, centers=n, ...)) if (class(pars) == "try-error") { warning("jittering in bclust") jvar <- jitter(rep(x=var, times=rtimes)) pars <- try(bclust(x=jvar, centers=n, ...)) if (class(pars) == "try-error") stop("bclust failed after jittering") else { cols <- match(pars$cluster, order(c(pars$centers))) rbrks <- unlist(tapply(jvar, factor(cols), range)) } } else { cols <- match(pars$cluster, order(c(pars$centers))) rbrks <- unlist(tapply(var, factor(cols), range)) } names(rbrks) <- NULL brks <- .rbrks(rbrks) } else if (style =="fisher") { # introduced related to https://github.com/r-spatial/classInt/issues/7 if (sampling) { pars <- fish(x=c(range(var), sample(x=var, size=nsamp)), k=n) } else { pars <- fish(x=var, k=n) } brks <- pars[n,1] for (i in n:1) brks <- c(brks, (pars[i,2]+pars[(i-1),1])/2) brks <- c(brks, pars[1,2]) colnames(pars) <- c("min", "max", "class mean", "class sd") } else if (style == "jenks") { # Jenks Optimisation Method # change contributed by Richard Dunlap 090512 # This version of the Jenks code assumes intervals are closed on # the right -- force it.
intervalClosure = "right" if (storage.mode(var) != "double") storage.mode(var) <- "double" # introduced related to https://github.com/r-spatial/classInt/issues/7 if (sampling) { message("Use \"fisher\" instead of \"jenks\" for larger data sets") d <- sort(c(range(var), sample(x=var, size=nsamp))) } else { d <- sort(var) } k <- n #work<-matrix(0,k,length(d)) mat1 <- matrix(1, length(d), k) mat2 <- matrix(0, length(d), k) mat2[2:length(d),1:k] <- .Machine$double.xmax #R's max double value? v<-0 for(l in 2:length(d)){ s1=s2=w=0 for(m in 1:l){ i3 <- l - m + 1 val <- d[i3] s2 <- s2 + val * val s1 <- s1 + val w<-w+1 v <- s2 - (s1 * s1) / w i4 <- trunc(i3 - 1) if(i4 !=0){ for(j in 2:k){ if(mat2[l,j] >= (v + mat2[i4, j - 1])){ mat1[l,j] <- i3 mat2[l,j] <- v + mat2[i4, j - 1] } } } } mat1[l,1] <- 1 mat2[l,1] <- v } kclass<-1:k kclass[k] <- length(d) k <- length(d) last<-length(d) for(j in length(kclass):1){ id <- trunc(mat1[k,j]) - 1 kclass[j - 1] <- id k <- id #lower last <- k -1 #upper } # change uncontributed by Richard Dunlap 090512 # with the specification of intervalClosure for the presentation layer, # don't need to change this brks<-d[c(1, kclass)] } else if (style == "dpih") { # introduced related to https://github.com/r-spatial/classInt/issues/6 h <- dpih(var, ...) dots <- list(...) if (!is.null(dots$range.x)) { vmin <- dots$range.x[1] vmax <- dots$range.x[2] } else { vmin <- min(var) vmax <- max(var) } brks <- seq(vmin, vmax, by=h) } else if (style == "headtails") { # Contributed Diego Hernangómez dots <- list(...) thr <- ifelse(is.null(dots$thr), .4, dots$thr) thr <- min(1,max(0, thr)) head <- var breaks <- min(var, na.rm = TRUE) #Init with minimum for (i in 1:100) { mu <- mean(head, na.rm = TRUE) breaks <- c(breaks, mu) ntot <- length(head) #Switch head head <- head[head > mu] prop <- length(head) / ntot keepiter <- prop <= thr & length(head) > 1 if (isFALSE(keepiter)) {break} } #Add max to complete intervals brks <- sort(unique(c(breaks, max(var, na.rm = TRUE)))) } else stop(paste(style, "unknown")) } if (is.null(brks)) stop("Null breaks") if (POSIX) { ovar <- .POSIXct(ovar, TZ) brks <- .POSIXct(brks, TZ) } else if (DATE) { ovar <- as.Date(ovar, origin = "1970-01-01") brks <- as.Date(brks, origin = "1970-01-01") } res <- list(var=ovar, brks=brks) attr(res, "style") <- style attr(res, "parameters") <- pars attr(res, "nobs") <- nobs attr(res, "call") <- match.call() # change contributed by Richard Dunlap 090512 # Add intervalClosure and dataPrecision to the attributes so they're # available to the print method attr(res, "intervalClosure") <- intervalClosure attr(res, "dataPrecision") <- dataPrecision attr(res, "var_units") <- UNITS class(res) <- "classIntervals" res } .rbrks <- function(rbrks) { nb <- length(rbrks) if (nb < 2) stop("single break") brks <- c(rbrks[1], rbrks[nb]) if (nb > 2) { if (nb == 3) brks <- append(brks, rbrks[2], 1) else { ins <- NULL for (i in as.integer(seq(2,(nb-2),2))) { ins <- c(ins, ((rbrks[i]+rbrks[i+1])/2)) } brks <- append(brks, ins, 1) } } brks } findColours <- function(clI, pal, under="under", over="over", between="-", digits = getOption("digits"), cutlabels=TRUE) { if (class(clI) != "classIntervals") stop("Class interval object required") if (is.null(clI$brks)) stop("Null breaks") if (length(pal) < 2) stop("pal must contain at least two colours") cols <- findCols(clI) palette <- colorRampPalette(pal)(length(clI$brks)-1) res <- palette[cols] attr(res, "palette") <- palette tab <- tableClassIntervals(cols=cols, brks=clI$brks, under=under, over=over, 
between=between, digits=digits, cutlabels=cutlabels, intervalClosure=attr(clI, "intervalClosure"), dataPrecision=attr(clI, "dataPrecision")) attr(res, "table") <- tab res } # change contributed by Richard Dunlap 090512 # Looks for intervalClosure attribute to allow specification of # whether partition intervals are closed on the left or the right findCols <- function(clI) { if (class(clI) != "classIntervals") stop("Class interval object required") if (is.null(clI$brks)) stop("Null breaks") if (is.null(attr(clI, "intervalClosure")) || (attr(clI, "intervalClosure") == "left")) { cols <- findInterval(clI$var, clI$brks, all.inside=TRUE) } else { cols <- apply(array(apply(outer(clI$var, clI$brks, ">"), 1, sum)), 1, max, 1) } cols } # change contributed by Richard Dunlap 090512 # Added intervalClosure argument to allow specification of whether # partition intervals are closed on the left or the right # Added dataPrecision for rounding of the interval endpoints tableClassIntervals <- function(cols, brks, under="under", over="over", between="-", digits = getOption("digits"), cutlabels=TRUE, intervalClosure=c("left", "right"), dataPrecision=NULL, unique=FALSE, var) { # Matthieu Stigler 120705 unique # Matthieu Stigler 120705 intervalClosure <- match.arg(intervalClosure) lx <- length(brks) nres <- character(lx - 1) sep <- " " if (cutlabels) { sep <- "" between="," } if (is.null(intervalClosure) || (intervalClosure=="left")) { left = "[" right = ")" } else { left = "(" right = "]" } #The two global endpoints are going through roundEndpoint to get # formatting right, nothing more if (cutlabels) nres[1] <- paste("[", roundEndpoint(brks[1], intervalClosure, dataPrecision), between, roundEndpoint(brks[2], intervalClosure, dataPrecision), right, sep=sep) else nres[1] <- paste(under, roundEndpoint(brks[2], intervalClosure, dataPrecision), sep=sep) for (i in 2:(lx - 2)) { if (cutlabels) nres[i] <- paste(left, roundEndpoint(brks[i], intervalClosure, dataPrecision), between, roundEndpoint(brks[i + 1], intervalClosure, dataPrecision), right, sep=sep) else nres[i] <- paste(roundEndpoint(brks[i], intervalClosure, dataPrecision), between, roundEndpoint(brks[i + 1], intervalClosure, dataPrecision), sep=sep) } if (cutlabels) nres[lx - 1] <- paste(left, roundEndpoint(brks[lx - 1], intervalClosure, dataPrecision), between, roundEndpoint(brks[lx], intervalClosure, dataPrecision), "]", sep=sep) else nres[lx - 1] <- paste(over, roundEndpoint(brks[lx - 1], intervalClosure, dataPrecision), sep=sep) tab <- table(factor(cols, levels=1:(lx - 1))) names(tab) <- nres # Matthieu Stigler 120705 unique ## Assign unique label for intervals containing same left-right points if(unique&!missing(var)){ tab_unique<-tapply(var, cols, function(x) length(unique(x))) # tab_unique_vals<-tapply(var, cols, function(x) length(unique(x))) if(any(tab_unique==1)){ # w.unique <-which(tab_unique==1) w.unique <-as.numeric(names(which(tab_unique==1))) cat("Class found with one single (possibly repeated) value: changed label\n") # cols.unique <-cols%in%names(w.unique) cols.unique <-cols%in%w.unique names(tab)[w.unique] <- tapply(var[cols.unique ], cols[cols.unique ], function(x) if(is.null(dataPrecision)) unique(x) else round(unique(x), dataPrecision)) } } tab } # change contributed by Richard Dunlap 090512 # New helper method for tableClassIntervals roundEndpoint <- function(x, intervalClosure=c("left", "right"), dataPrecision) { # Matthieu Stigler 120705 intervalClosure <- match.arg(intervalClosure) if (is.null(dataPrecision)) { retval <- x } else 
if (is.null(intervalClosure) || (intervalClosure=="left")) { retval <- ceiling(x * 10^dataPrecision) / 10^dataPrecision } else { retval <- floor(x * 10^dataPrecision) / 10^dataPrecision } digits = getOption("digits") format(retval, digits=digits, trim=TRUE) } #FIXME output trailing zeros in decimals print.classIntervals <- function(x, digits = getOption("digits"), ..., under="under", over="over", between="-", cutlabels=TRUE, unique=FALSE) { if (class(x) != "classIntervals") stop("Class interval object required") cat("style: ", attr(x, "style"), "\n", sep="") UNITS <- attr(x, "var_units") if (is.null(UNITS)) UNITS <- "" else UNITS <- paste0(UNITS, " ") nP <- nPartitions(x) if (is.finite(nP)) cat(" one of ", prettyNum(nP, big.mark = ","), " possible partitions of this ", UNITS, "variable into ", length(x$brks)-1, " classes\n", sep="") cols <- findCols(x) nvar <- x$var if (inherits(nvar, "units")) attributes(nvar) <- NULL nbrks <- x$brks if (inherits(nbrks, "units")) attributes(nbrks) <- NULL # change contributed by Richard Dunlap 090512 # passes the intervalClosure argument to tableClassIntervals tab <- tableClassIntervals(cols=cols, brks=nbrks, under=under, over=over, between=between, digits=digits, cutlabels=cutlabels, intervalClosure=attr(x, "intervalClosure"), dataPrecision=attr(x, "dataPrecision"), unique=unique, nvar) print(tab, digits=digits, ...) invisible(tab) } nPartitions <- function(x) { n <- attr(x, "nobs") if (n > 170) ret <- Inf else { k <- length(x$brks)-1 ret <- (factorial(n - 1))/(factorial(n - k) * factorial(k - 1)) } ret } getBclustClassIntervals <- function(clI, k) { if (class(clI) != "classIntervals") stop("Class interval object required") if (missing(k)) k <- length(clI$brks)-1 if (class(attr(clI, "parameters")) != "bclust") stop("Class interval object not made with style=\"bclust\"") ovar <- clI$var var <- clI$var if (any(!is.finite(var))) is.na(var) <- !is.finite(var) var <- c(na.omit(var)) obj <- attr(clI, "parameters") cols <- match(clusters.bclust(obj, k=k), order(centers.bclust(obj, k=k))) rbrks <- unlist(tapply(var, factor(cols), range)) names(rbrks) <- NULL brks <- .rbrks(rbrks) res <- list(var=ovar, brks=brks) attr(res, "style") <- attr(clI, "style") attr(res, "parameters") <- attr(clI, "parameters") attr(res, "nobs") <- attr(clI, "nobs") attr(res, "call") <- attr(clI, "call") attr(res, "modified") <- c(attr(clI, "modified"), k) class(res) <- "classIntervals" res } getHclustClassIntervals <- function(clI, k) { if (class(clI) != "classIntervals") stop("Class interval object required") if (missing(k)) k <- length(clI$brks)-1 if (class(attr(clI, "parameters")) != "hclust") stop("Class interval object not made with style=\"hclust\"") ovar <- clI$var var <- clI$var if (any(!is.finite(var))) is.na(var) <- !is.finite(var) var <- c(na.omit(var)) obj <- attr(clI, "parameters") rcluster <- cutree(tree=obj, k=k) rcenters <- unlist(tapply(var, factor(rcluster), mean)) cols <- match(rcluster, order(c(rcenters))) rbrks <- unlist(tapply(var, factor(cols), range)) names(rbrks) <- NULL brks <- .rbrks(rbrks) res <- list(var=ovar, brks=brks) attr(res, "style") <- attr(clI, "style") attr(res, "parameters") <- attr(clI, "parameters") attr(res, "nobs") <- attr(clI, "nobs") attr(res, "call") <- attr(clI, "call") attr(res, "modified") <- c(attr(clI, "modified"), k) class(res) <- "classIntervals" res } fish <- function(x, k) { x <- sort(x) m <- length(x) k <- as.integer(k) work <- double(m*k) iwork <- integer(m*k) res <- double(k*4) out <- .Fortran("fish", as.integer(m), as.double(x), 
as.integer(k), as.integer(m), as.double(work), as.integer(m), as.integer(iwork), as.double(res), PACKAGE="classInt")[[8]] out <- matrix(out, k, 4) out } classInt/R/logLik.R0000644000176200001440000000727513552415607013622 0ustar liggesusers#' Log-likelihood for classIntervals objects #' #' @details #' #' Generally, the likelihood is a method for minimizing the standard deviation #' within an interval, and with the AIC, a per-interval penalty can be used to #' maximize the information and self-similarity of data in the interval. #' #' Based on Birge 2006 and Davies 2009 (see references), interval binning #' selections may be compared by likelihood to optimize the number of intervals #' selected for a set of data. The `logLik()` function (and associated `AIC()` #' function) can be used to optimize binning by maximizing the likelihood across #' choices of intervals. #' #' As illustrated by the examples below (the AIC comparison does not #' specifically select 3 intervals when comparing 2, 3, and 4 intervals for data #' with 3 intervals), while likelihood-based methods can provide evidence toward #' optimization of binning, they are not infallible for bin selection. #' #' @param object A classIntervals object #' @param ... Ignored. #' @return A `logLik` object (see `stats::logLik`). #' @examples #' x <- classIntervals(rnorm(100), n=5, style="fisher") #' logLik(x) #' AIC(x) # By having a logLik method, AIC.default is used. #' #' # When the intervals are made of a limited number of discrete values, the #' # logLik is zero by definition (the standard deviation is zero giving a dirac #' # function at the discrete value indicating a density of 1 and a log-density #' # of zero). #' x <- classIntervals(rep(1:2, each=10), n=2, style="jenks") #' logLik(x) #' x <- classIntervals(rep(1:3, each=10), n=2, style="jenks") #' logLik(x) #' #' # With slight jitter but notable categorical intervals (at 1, 2, and 3), the #' # AIC will make selection of the optimal intervals easier. #' data <- rep(1:3, each=100) + runif(n=300, min=-0.01, max=0.01) #' x_2 <- classIntervals(data, n=2, style="jenks") #' x_3 <- classIntervals(data, n=3, style="jenks") #' x_4 <- classIntervals(data, n=4, style="jenks") #' AIC(x_2, x_3, x_4) #' @references #' Lucien Birge, Yves Rozenholc. How many bins should be put in a regular #' histogram. ESAIM: Probability and Statistics. 31 January 2006. 10:24-45. #' url: https://www.esaim-ps.org/articles/ps/abs/2006/01/ps0322/ps0322.html. #' doi:10.1051/ps:2006001 #' #' Laurie Davies, Ursula Gather, Dan Nordman, Henrike Weinert. A comparison of #' automatic histogram constructions. ESAIM: Probability and Statistics. 11 #' June 2009. 13:181-196. url: #' https://www.esaim-ps.org/articles/ps/abs/2009/01/ps0721/ps0721.html #' doi:10.1051/ps:2008005 #' @export logLik.classIntervals <- function(object, ...) 
{ df <- length(object$brks) - 1 current_loglik <- 0 for (idx in seq_len(df)) { mask_current <- if (((idx == 1) & (attr(object, "intervalClosure") == "right")) | ((idx == df) & (attr(object, "intervalClosure") == "left"))) { object$brks[idx] <= object$var & object$var <= object$brks[idx + 1] } else if (attr(object, "intervalClosure") == "right") { object$brks[idx] < object$var & object$var <= object$brks[idx + 1] } else if (attr(object, "intervalClosure") == "left") { object$brks[idx] <= object$var & object$var < object$brks[idx + 1] } if (sum(mask_current)) { current_x <- object$var[mask_current] current_loglik <- current_loglik + if (length(unique(current_x)) == 1) { # Assume that the density is 1 at the unique value's location and zero # elsewhere. Therefore the log-density is 0. 0 } else { sum(dnorm(x=current_x, mean=mean(current_x), sd=sd(current_x), log=TRUE)) } } } structure(current_loglik, df=df, nobs=length(object$var), class="logLik") } classInt/MD50000644000176200001440000000221313643057612012347 0ustar liggesusers6d598c7e05ac2dc24cc589516414b104 *ChangeLog 2bce92c22883cbce3e45569a6864b5be *DESCRIPTION 977913b7a28af161af142d69a85c7040 *NAMESPACE 8c8ca3a60dad708db67face920d9013f *R/classInt.R eb6dcb10dcbdf12158501121914cf2d2 *R/logLik.R 81ca160e18b72bbb9c0b2e8a1e7e9171 *build/vignette.rds 6d598c7e05ac2dc24cc589516414b104 *inst/ChangeLog 3e85b430e7bb48716f9a0dc3f266af8e *inst/doc/headtailsR.R bda2bf62a3c4f367d9af16a83800f666 *inst/doc/headtailsR.Rmd 94e309622bd816552833f809accd1bee *inst/doc/headtailsR.html 6c88138c61e0b9eb9388c84b03ad8a22 *man/classIntervals.Rd 38caf6804bf24cc541ee5c7e0c2a618a *man/findColours.Rd d93424391cc83b397a77bf71cf05667b *man/findCols.Rd f4d945d8c12bd18bfe0dd2b63c0c96a2 *man/getBclustClassIntervals.Rd 1e55d7a5756ddc636f99668e26e716ee *man/jenks.tests.Rd d34f597812c5bffe8840e1a34ee5fca6 *man/logLik.classIntervals.Rd 82f1914717b463cbe6e94c91c8559f2d *src/fish1.f 0cd11b71e236d7d1521456c02f7c308b *src/init.c e08e641d0e13b787df791fb56736761d *tests/test_Unique.R f1c9e6195f0e86ab797ecd34ed705821 *tests/test_Unique.Rout.save bda2bf62a3c4f367d9af16a83800f666 *vignettes/headtailsR.Rmd 84e6098d6ff5f3c093b2545fb8f81e08 *vignettes/refs_ht.bib classInt/inst/0000755000176200001440000000000013642404236013013 5ustar liggesusersclassInt/inst/ChangeLog0000644000176200001440000003327713642403720014576 0ustar liggesusers2020-03-27 Roger Bivand * : Merge pull request #26 from dieghernan/htvignette Add "headtails" vignette 2020-03-27 dieghernan * vignettes/.gitignore, vignettes/headtailsR.Rmd, vignettes/refs_ht.bib: Create vignette 2020-03-27 dieghernan * .gitignore, DESCRIPTION: Initial config of the vignette 2020-03-21 Roger Bivand * docs/404.html, docs/authors.html, docs/index.html, docs/pkgdown.yml, docs/reference/classIntervals.html, docs/reference/findColours.html, docs/reference/findCols.html, docs/reference/getBclustClassIntervals.html, docs/reference/index.html, docs/reference/jenks.tests.html, docs/reference/logLik.classIntervals.html: rebuild pkgdown 2020-03-21 Roger Bivand * README.md: ping 2020-03-21 Roger Bivand * DESCRIPTION: add missing ) 2020-03-21 Roger Bivand * : Merge pull request #25 from dieghernan/patch-1 add ctb 2020-03-21 Roger Bivand * DESCRIPTION: Update DESCRIPTION 2020-03-21 Roger Bivand * DESCRIPTION: Update DESCRIPTION 2020-03-21 Roger Bivand * DESCRIPTION: Update DESCRIPTION Add Diego as ctb 2020-03-21 Roger Bivand * : Merge pull request #24 from dieghernan/master Add "headtails" #20 2020-03-20 dieghernan * man/classIntervals.Rd: Update man with 
"headtails" 2020-03-20 dieghernan * R/classInt.R: Change on initialitation of breaks: - Benefit of this is that now breaks on sytle "headtails" are initialised with units 2020-03-20 dieghernan * R/classInt.R: Update "headtails" following #22 2020-03-17 dieghernan * R/classInt.R: sytle "headtails" implemented 2020-03-20 Roger Bivand * : Merge pull request #23 from dieghernan/stylesnon Fix #22 2019-12-03 Roger Bivand * DESCRIPTION, R/classInt.R: fixes #19 2019-12-03 Roger Bivand * : commit 9f2fc7976578d00860bc4d72674ffe131d7062ed Author: Roger Bivand Date: Tue Dec 3 15:33:36 2019 +0100 2019-10-18 Roger Bivand * : Merge pull request #17 from wibeasley/patch-1 link to pkgdown site 2019-10-18 Will Beasley * DESCRIPTION: link to pkgdown site for example, like [tidyr](https://github.com/tidyverse/tidyr/blob/master/DESCRIPTION) 2019-09-17 Roger Bivand * DESCRIPTION: Update DESCRIPTION 2019-09-17 Roger Bivand * : Merge pull request #14 from billdenney/add-ctb Add Bill Denney as a contributor 2019-09-17 Bill Denney * DESCRIPTION: Add Bill Denney as a contributor 2019-09-17 Bill Denney * NAMESPACE: Include stats imports for logLik 2019-09-17 Bill Denney * tests/test_Unique.Rout.save: Update test to show logLik tests 2019-09-17 Bill Denney * R/logLik.R, man/logLik.classIntervals.Rd, tests/test_Unique.R: Improve documentation, add tests 2019-09-16 Bill Denney * DESCRIPTION, NAMESPACE, R/logLik.R, man/logLik.classIntervals.Rd: Add logLik method (Fix #6) 2019-08-11 Edzer Pebesma * DESCRIPTION, R/classInt.R: bump version, add support for Date 2019-07-23 Roger Bivand * : commit 33c05c2763ef7aa25d5a31446a36b6ed131729fd Author: Roger Bivand Date: Tue Jul 23 17:49:45 2019 +0200 2019-07-23 Roger Bivand * : Merge pull request #9 from r-spatial/trytravis try units travis 2019-07-23 Roger Bivand * DESCRIPTION, R/classInt.R, man/classIntervals.Rd: added POSIXt and units support 2019-05-09 Roger Bivand * DESCRIPTION, R/classInt.R, man/classIntervals.Rd: restrict sampling to relevant styles 2019-04-23 Roger Bivand * DESCRIPTION, tests/test_Unique.R, tests/test_Unique.Rout.save: reverse R 3.6 dependency 2019-04-22 Roger Bivand * docs/authors.html, docs/index.html, docs/reference/classIntervals.html, docs/reference/jenks.tests.html: update docs 2019-04-22 Roger Bivand * DESCRIPTION, R/classInt.R, man/classIntervals.Rd, tests/test_Unique.Rout.save: R 3.6 sample ready and sample jenks/fisher like QGIS #7 2019-03-25 Roger Bivand * docs/reference/classIntervals.html, docs/reference/findColours.html, docs/reference/findCols.html, docs/reference/getBclustClassIntervals.html, docs/reference/index.html, docs/reference/jenks.tests.html: update docs for dpih 2019-03-25 Roger Bivand * DESCRIPTION, NAMESPACE, R/classInt.R, man/classIntervals.Rd: add dpih 2018-12-18 Roger Bivand * DESCRIPTION: Update DESCRIPTION 2018-12-18 Roger Bivand * DESCRIPTION: Update DESCRIPTION Add NeedsCompilation=yes 2018-12-18 Roger Bivand * : Merge pull request #5 from angela-li/master Add a note to the README re: compiling from source 2018-12-18 Angela Li * README.md: Add a heading re: compiling from source 2018-12-18 Angela Li * README.md: Add note about making sure gfortran is installed 2018-12-18 Roger Bivand * DESCRIPTION, NAMESPACE, docs/_config.yml, docs/authors.html, docs/index.html, docs/reference/classIntervals.html, docs/reference/findColours.html, docs/reference/findCols.html, docs/reference/getBclustClassIntervals.html, docs/reference/index.html, docs/reference/jenks.tests.html, man/classIntervals.Rd, man/findColours.Rd, man/findCols.Rd, 
man/getBclustClassIntervals.Rd, man/jenks.tests.Rd, tests/test_Unique.Rout.save: move spData to Suggests 2018-12-14 Roger Bivand * docs/_config.yml: Set theme jekyll-theme-minimal 2018-12-13 Roger Bivand * .Rbuildignore, DESCRIPTION, docs/authors.html, docs/docsearch.css, docs/docsearch.js, docs/index.html, docs/link.svg, docs/pkgdown.css, docs/pkgdown.js, docs/pkgdown.yml, docs/reference/classIntervals.html, docs/reference/findColours.html, docs/reference/findCols.html, docs/reference/getBclustClassIntervals.html, docs/reference/index.html, docs/reference/jenks.tests.html: add pkgdown 2018-04-16 Roger Bivand * tests/test_Unique.Rout.save: add coerce to shingle to NAMESPACE 2018-04-16 Roger Bivand * DESCRIPTION: add coerce to shingle to NAMESPACE 2018-04-16 Roger Bivand * NAMESPACE: add coerce to shingle to NAMESPACE 2017-11-03 Roger Bivand * .travis.yml: Update .travis.yml 2017-11-02 Roger Bivand * README.md: Update README.md 2017-11-02 Roger Bivand * NAMESPACE, tests/test_Unique.Rout.save: rebasing data to spData 2017-11-02 Roger Bivand * DESCRIPTION, inst/ChangeLog, man/classIntervals.Rd, man/findColours.Rd, man/findCols.Rd, man/getBclustClassIntervals.Rd, man/jenks.tests.Rd, man/jenks71.Rd: rebasing data to spData 2017-11-02 Roger Bivand * ChangeLog, DESCRIPTION, oChangeLog: tidying old ChangeLog 2017-11-02 Roger Bivand * .Rbuildignore: Update .Rbuildignore 2017-11-02 Roger Bivand * README.md: add badges to README 2017-11-02 Roger Bivand * .travis.yml: Create .travis.yml 2017-11-02 Roger Bivand * DESCRIPTION, R/classInt.R, man/classIntervals.Rd: adding argument to suppress too small n warning 2017-11-01 Roger Bivand * .Rbuildignore: Update .Rbuildignore 2017-11-01 Roger Bivand * README.md: Create README.md 2017-10-22 Edzer Pebesma * : commit 455886261f64b085130f8de88d06c86d474e4a6d Author: Edzer Pebesma Date: Sun Oct 22 13:42:03 2017 +0200 2017-10-22 Roger Bivand * DESCRIPTION: update DESCRIPTION 2017-10-22 Roger Bivand * initial git repo ## Historical record of SVN commits 2009-2017, CVS commits up to 2009 2017-04-14 11:31 rsbivand * DESCRIPTION, src/init.c: added registration 2015-09-28 17:49 rsbivand * ChangeLog, inst/ChangeLog: tidy 2015-09-28 17:49 rsbivand * ChangeLog, DESCRIPTION: tidy 2015-06-28 12:14 rsbivand * DESCRIPTION, NAMESPACE: CRAN _R_CHECK_CODE_USAGE_WITH_ONLY_BASE_ATTACHED_=true NAMESPACE tidy 2015-04-13 15:28 rsbivand * svn2cl.xsl: move to distributed svn2cl 2015-01-10 14:20 rsbivand * data/jenks71.rda: rebuild jenks71.rda 2015-01-10 14:19 rsbivand * DESCRIPTION, data/jenks71.rda: rebuild jenks71.rda 2015-01-06 12:03 rsbivand * DESCRIPTION: tidy 2015-01-06 12:02 rsbivand * DESCRIPTION: tidy 2015-01-06 09:32 rsbivand * ChangeLog, inst/ChangeLog, man/classIntervals.Rd: improvements to jenks documentation 2015-01-05 20:00 rsbivand * ChangeLog, inst/ChangeLog: tidy 2015-01-05 20:00 rsbivand * DESCRIPTION, man/classIntervals.Rd: improvements to jenks documentation 2014-04-06 17:05 rsbivand * ChangeLog: close ring in Polygon 2013-08-30 11:55 rsbivand * ChangeLog, inst/ChangeLog: tidy 2013-08-30 11:54 rsbivand * .Rbuildignore, ChangeLog, inst/ChangeLog: tidy 2013-08-29 14:26 rsbivand * DESCRIPTION, NAMESPACE: tidy 2013-07-28 19:37 rsbivand * DESCRIPTION, NAMESPACE: thinning load depends 2013-06-22 14:40 rsbivand * ChangeLog, inst/ChangeLog: tidy 2013-06-22 14:39 rsbivand * man/classIntervals.Rd, man/findColours.Rd, man/findCols.Rd, man/jenks.tests.Rd: help line lengths 2013-06-22 14:33 rsbivand * ChangeLog, inst/ChangeLog: tidy 2013-06-22 14:33 rsbivand * DESCRIPTION: tidy 
2013-06-22 14:31 rsbivand * ChangeLog, inst/ChangeLog: tidy 2013-06-12 10:46 rsbivand * man/classIntervals.Rd, man/findColours.Rd: add more documentation on cutlabels= argument 2013-02-07 10:43 rsbivand * R/classInt.R: handle non-integer GRASS parameters more forgivingly 2012-11-05 17:05 rsbivand * ChangeLog, inst/ChangeLog: tidy 2012-11-05 17:04 rsbivand * DESCRIPTION: tidy 2012-07-22 13:30 rsbivand * DESCRIPTION: Authors@R classInt 2012-07-16 13:50 rsbivand * ChangeLog, inst/ChangeLog: tidy 2012-07-16 13:49 rsbivand * DESCRIPTION, R/classInt.R, man/classIntervals.Rd, tests, tests/test_Unique.R, tests/test_Unique.Rout.save: adding unique revisions, documentation and tests 2012-07-05 17:42 rsbivand * DESCRIPTION: add unique label option, check intervalClusure 2012-07-05 17:41 rsbivand * R/classInt.R, man/classIntervals.Rd: add unique label option, check intervalClusure 2011-11-21 10:34 rsbivand * R/classInt.R, man/classIntervals.Rd: change jenks storage mode to double 2011-11-14 10:58 rsbivand * ChangeLog, inst/ChangeLog: tidy 2011-11-10 07:30 rsbivand * ChangeLog, inst/ChangeLog: dots in fixed style 2011-11-10 07:29 rsbivand * DESCRIPTION, R/classInt.R: dots in fixed style 2011-10-21 15:56 rsbivand * DESCRIPTION, R/classInt.R: classInt NA handling 2011-05-26 21:22 rsbivand * DESCRIPTION, R/classInt.R, man/classIntervals.Rd: block Inf warning in print.classIntervals 2011-02-22 16:37 rsbivand * ChangeLog: tidy 2011-02-22 16:24 rsbivand * oChangeLog, svn2cl.xsl: tidy 2011-02-22 16:18 rsbivand * .: tidy 2009-12-21 10:09 rsbivand * DESCRIPTION, R/classInt.R, man/classIntervals.Rd: classInterval to shingle 2009-10-20 10:22 rsbivand * ChangeLog, inst/ChangeLog: argument passing 2009-10-20 10:19 rsbivand * DESCRIPTION, R/classInt.R, man/classIntervals.Rd: argument passing 2009-09-17 10:19 rsbivand * DESCRIPTION, man/classIntervals.Rd, man/findColours.Rd, ChangeLog: fix documentation links 2009-05-25 12:20 rsbivand * DESCRIPTION, R/classInt.R, man/classIntervals.Rd, man/findColours.Rd, ChangeLog: representation update 2 2009-05-25 08:17 rsbivand * DESCRIPTION, R/classInt.R, man/classIntervals.Rd: representation overhaul 1 2009-05-12 10:33 rsbivand * ChangeLog: tidy 2009-05-12 10:33 rsbivand * DESCRIPTION, R/classInt.R, man/classIntervals.Rd: correction to jenks style intervals 2008-01-18 22:40 rsbivand * DESCRIPTION: jenks 2007-11-21 19:13 rsbivand * DESCRIPTION, R/classInt.R, man/classIntervals.Rd: Jenks 2007-09-04 14:49 rsbivand * ChangeLog: Changelog 2007-08-24 09:20 rsbivand * DESCRIPTION, man/classIntervals.Rd: methods Rd 2006-12-07 19:19 rsbivand * DESCRIPTION, src/fish1.f: E300 2006-03-20 09:30 rsbivand * DESCRIPTION, NAMESPACE, R/classInt.R, man/classIntervals.Rd, man/findColours.Rd, src/fish1.f: 1-5 2006-03-10 14:13 rsbivand * DESCRIPTION, NAMESPACE, R/classInt.R, data/jenks71.rda, man/classIntervals.Rd, man/findColours.Rd, man/findCols.Rd, man/getBclustClassIntervals.Rd, man/jenks.tests.Rd, man/jenks71.Rd: Initial revision 2006-03-10 14:13 rsbivand * DESCRIPTION, NAMESPACE, R/classInt.R, data/jenks71.rda, man/classIntervals.Rd, man/findColours.Rd, man/findCols.Rd, man/getBclustClassIntervals.Rd, man/jenks.tests.Rd, man/jenks71.Rd: initial import classInt/inst/doc/0000755000176200001440000000000013642404236013560 5ustar liggesusersclassInt/inst/doc/headtailsR.html0000644000176200001440000213353113642404236016536 0ustar liggesusers Head/Tails breaks on the classInt package.

Head/Tails breaks on the classInt package.

Diego Hernangomez

2020-04-05

There are far more ordinary people (say, 80 percent) than extraordinary people (say, 20 percent); this is often characterized by the 80/20 principle, based on the observation made by the Italian economist Vilfredo Pareto in 1906 that 80% of land in Italy was owned by 20% of the population. A histogram of the data values for these phenomena would reveal a right-skewed or heavy-tailed distribution. How to map the data with the heavy-tailed distribution?
Jiang (2013)

Abstract

This vignette discusses the implementation of the “Head/tail breaks” style (Jiang (2013)) on the classIntervals function. A step-by-step example is presented in order to clarify the method. A case study using spData::afcon is also included, as well as a test suite checking the performance and validation of the implementation.

Introduction

The Head/tail breaks, sometimes referred to as the ht-index (Jiang and Yin (2013)), is a classification scheme introduced by Jiang (2013) in order to find groupings or hierarchy in data with a heavy-tailed distribution.

Heavy-tailed distributions are heavily right skewed, with a minority of large values in the head and a majority of small values in the tail. This imbalance between the head and tail, or between many small values and a few large values, can be expressed as “far more small things than large things”.

Heavy-tailed distributions are commonly characterized by a power law, a lognormal or an exponential function. Nature, society, finance (Vasicek (2002)) and our daily lives are full of rare and extreme events, which are termed “black swan events” (Taleb (2008)). This line of thinking provides a good reason to reverse our thinking by focusing on low-frequency events.
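Throughout the vignette, a Pareto-distributed sample is used to characterize this kind of distribution. Its generating code, taken from the vignette source (the companion plots are omitted here), is:

library(classInt)

# Pareto distribution; shape b = 1.161 approximates the 80/20 principle
set.seed(1234)
sample_par <- 1 / (1 - runif(1000)) ^ (1 / 1.161)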

Breaking method

The method itself consists of a four-step process performed recursively until a stopping condition is satisfied. Given a vector of values var, the process can be described as follows:

  1. Compute mu = mean(var).
  2. Break var into the tail (as var <= mu) and the head (as var > mu).
  3. Assess whether the proportion of the head over var is less than or equal to a given threshold (i.e. length(head)/length(var) <= thr).
  4. If the check in 3 is TRUE, repeat 1 to 3 until the condition is FALSE or no more partitions are possible (i.e. head has fewer than two elements, expressed as length(head) < 2).

It is important to note that, at the beginning of a new iteration, var is replaced by head. The underlying hypothesis is to create partitions until the head and the tail are balanced in terms of distribution. So the stopping criterion is satisfied when the last head and the last tail are evenly balanced.

In terms of threshold, Jiang, Liu, and Jia (2013) set 40% as a good approximation, meaning that if the head contains more than 40% of the observations the distribution is not considered heavy-tailed.

The final breaks are the vector of the consecutive mu values.

Step by step example

We reproduce here the pseudo-code[1] as per Jiang (2019):

Recursive function Head/tail Breaks:
 Rank the input data from the largest to the smallest
 Break the data into the head and the tail around the mean;
 // the head for those above the mean
 // the tail for those below the mean
 While (head <= 40%):
 Head/tail Breaks (head);
End Function

A step-by-step example in R (for illustrative purposes) has been developed:
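(Chunk condensed from the vignette source; the per-iteration histograms are omitted here.)

var <- sample_par
thr <- 0.4
brks <- c(min(var), max(var)) # initialise with min and max
sum_table <- data.frame(iter = 0, mu = NA, prop = NA,
                        n_var = NA, n_head = NA)

for (i in 1:10) {
  mu <- mean(var)
  brks <- sort(c(brks, mu))           # each mu becomes a new break
  head <- var[var > mu]               # observations above the mean
  prop <- length(head) / length(var)  # proportion of head over var
  stopit <- prop < thr & length(head) > 1
  sum_table <- rbind(sum_table, c(i, mu, prop, length(var), length(head)))
  if (isFALSE(stopit)) break
  var <- head                         # iterate on the head only
}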

As can be seen, in each iteration the resulting head gradually loses the heavy-tail property, until the stopping condition is met.

iter        mu    prop  n_var  n_head
   1    5.6755   14.5%   1000     145
   2   27.2369  21.38%    145      31
   3   85.1766  19.35%     31       6
   4  264.7126     50%      6       3

The resulting breaks are then defined as breaks = c(min(var), mu(iter=1), ..., mu(iter), max(var)).

Implementation on classInt package

The implementation in the classIntervals function should replicate the results:
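(Check reproduced from the vignette source; brks is the vector built by the loop above.)

ht_sample_par <- classIntervals(sample_par, style = "headtails")
brks == ht_sample_par$brks # should be all TRUE
print(ht_sample_par)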

As stated in Jiang (2013), the number of breaks is naturally determined; however, the thr parameter can help to adjust the final number. A lower value of thr provides fewer breaks, while a larger thr increases the number, provided the underlying distribution follows the “far more small things than large things” principle.
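For instance, on the same Pareto sample (calls adapted from the vignette source; plots omitted):

# thr = 0 forces the minimum output: a single break at mean(var)
print(classIntervals(sample_par, style = "headtails", thr = 0))
# thr = 0.2 yields two breaks
print(classIntervals(sample_par, style = "headtails", thr = 0.2))
# default thr = 0.4
print(classIntervals(sample_par, style = "headtails"))
# thr = 1 keeps partitioning as long as the head can be split
print(classIntervals(sample_par, style = "headtails", thr = 1))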

The method always returns at least one break, corresponding to mean(var).

Case study

Jiang (2013) states that “the new classification scheme is more natural than the natural breaks in finding the groupings or hierarchy for data with a heavy-tailed distribution.” (p. 482), referring to Jenks’ natural breaks method. In this case study we compare “headtails” with “fisher”, the alias for the Fisher-Jenks algorithm, which is always preferred to the “jenks” style (see ?classIntervals). For this example we use the afcon dataset from the spData package.

Let’s have a look at the top 10 values and the distribution of the variable totcon (index of total conflict 1966-78):
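(Code from the vignette source:)

library(spData)
data(afcon, package = "spData")

# Top 10 countries by total conflict index
knitr::kable(head(afcon[order(afcon$totcon, decreasing = TRUE),
                        c("name", "totcon")], 10))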

     name          totcon
EG   EGYPT           5246
SU   SUDAN           4751
UG   UGANDA          3134
CG   ZAIRE           3087
TZ   TANZANIA        2881
LY   LIBYA           2355
KE   KENYA           2273
SO   SOMALIA         2122
ET   ETHIOPIA        1878
SF   SOUTH AFRICA    1875

The data show that EG and SU present a clear hierarchy over the rest of the values. As per the histogram, we can confirm a heavy-tailed distribution and therefore the “far more small things than large things” principle.

As an additional check, on top of “headtails” and “fisher” we also use “quantile”, to get a broader view of the different breaking styles. As “quantile” is a position-based metric, it doesn’t account for the magnitude of x (hierarchy), so the breaks are solely defined by the position of x in the distribution.

Applying the three aforementioned methods to break the data:
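(Code from the vignette source; the plots of the resulting intervals are omitted here.)

brks_ht <- classIntervals(afcon$totcon, style = "headtails")
print(brks_ht)

# Use the same number of classes for "fisher" and "quantile"
nclass <- length(brks_ht$brks) - 1
brks_fisher <- classIntervals(afcon$totcon, style = "fisher", n = nclass)
brks_quantile <- classIntervals(afcon$totcon, style = "quantile", n = nclass)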

It is observed that the top three classes of “headtails” enclose 5 observations, whereas “fisher” includes 13 observations. In terms of classification, the “headtails” breaks focus more on extreme values.

The next plot compares a continuous distribution of totcon rescaled to the range [1, nclass] versus the distribution across breaks for each style. The continuous distribution has been offset by -0.5 in order to align the continuous and the discrete distributions.
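The rescaling helper and the per-style class assignments (from the vignette source) are shown below; help_reescale is reused later to size the points of the continuous map.

# Helper function to rescale values to a given range
help_reescale <- function(x, min = 1, max = 10) {
  r <- (x - min(x)) / (max(x) - min(x))
  r <- r * (max - min) + min
  return(r)
}

afcon$ecdf_class <- help_reescale(afcon$totcon,
                                  min = 1 - 0.5,
                                  max = nclass - 0.5)

# Class assigned to each observation under each style
afcon$ht_breaks <- cut(afcon$totcon, brks_ht$brks,
                       labels = FALSE, include.lowest = TRUE)
afcon$fisher_breaks <- cut(afcon$totcon, brks_fisher$brks,
                           labels = FALSE, include.lowest = TRUE)
afcon$quantile_break <- cut(afcon$totcon, brks_quantile$brks,
                            labels = FALSE, include.lowest = TRUE)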

It can be observed that the distribution of “headtails” breaks is also heavy-tailed, and closer to the original distribution. At the other extreme, “quantile” provides a quasi-uniform distribution, ignoring the totcon hierarchy.

In terms of data visualization, we compare here the final maps produced with the techniques mentioned above. In this plotting exercise:

  • The cex of the points is always between 1 and 5.
  • For the continuous approach, no classes are provided. This plot will be used as the reference.
  • For the remaining styles, the col and cex of each point are defined by the class of that point.
custompal <- c("#FE9F6D99",
               "#DE496899",
               "#8C298199",
               "#3B0F7099",
               "#00000499")

afcon$cex_points <- help_reescale(afcon$totcon,
                                  min = 1,
                                  max = 5)
opar <- par(no.readonly = TRUE)
par(mar = c(1.5, 1.5, 2, 1.5), cex = 0.8)
# Plot continuous
plot(
  x = afcon$x,
  y = afcon$y,
  axes = FALSE,
  cex = afcon$cex_points,
  pch = 20,
  col = "grey50",
  main = "Continuous"
)

mcont <- (max(afcon$totcon) - min(afcon$totcon)) / 4
legcont <- 1:5 * mcont - (mcont - min(afcon$totcon))

legend("bottomleft",
       xjust = 1,
       bty = "n",
       legend = paste0("   ",
                  round(legcont, 0)
                  ),
       col = "grey50",
  pt.cex = seq(1, 5),
  pch = 20,
  title = "totcon"
)
box()

plot(
  x = afcon$x,
  y = afcon$y,
  axes = FALSE,
  cex = afcon$ht_breaks,
  pch = 20,
  col = custompal[afcon$ht_breaks],
  main = "headtails"
)
legend(
  "bottomleft",
  xjust = 1,
  bty = "n",
  legend = paste0("   ",
                  round(brks_ht$brks[2:6],0)
                  ),
  col = custompal,
  pt.cex = seq(1, 5),
  pch = 20,
  title = "totcon"
)
box()

plot(
  x = afcon$x,
  y = afcon$y,
  axes = FALSE,
  cex = afcon$fisher_breaks,
  pch = 20,
  col = custompal[afcon$fisher_breaks],
  main = "fisher"
)
legend(
  "bottomleft",
  xjust = 1,
  bty = "n",
  legend = paste0("   ",
                  round(brks_fisher$brks[2:6],0)
                  ),
  col = custompal,
  pt.cex = seq(1, 5),
  pch = 20,
  title = "totcon"
)
box()

plot(
  x = afcon$x,
  y = afcon$y,
  axes = FALSE,
  cex = afcon$quantile_break,
  pch = 20,
  col = custompal[afcon$quantile_break],
  main = "quantile"
)
legend(
  "bottomleft",
  xjust = 1,
  bty = "n",
  legend = paste0("   ",
                  round(brks_quantile$brks[2:6],0)
                  ),
  col = custompal,
  pt.cex = seq(1, 5),
  pch = 20,
  title = "totcon"
)
box()

par(opar)

As per the results, “headtails” seems to provide a better understanding of the most extreme values when the result is compared against the continuous plot. The “quantile” style, as expected, just provides a clustering without taking the real hierarchy into account. The “fisher” plot sits in between these two interpretations.

It is also important to note that “headtails” and “fisher” reveal different information that can be useful depending on the context. While “headtails” highlights the outliers, it fails to provide a good clustering of the tail, whereas “fisher” seems to reflect those patterns better. This can be observed in the values of Western Africa and the Niger River Basin: where “headtails” doesn’t highlight any special cluster of conflicts, “fisher” suggests a potential cluster. This can be confirmed in the histogram generated previously, where a concentration of totcon around 1,000 is visible.

Testing and benchmark

In this section the performance of the “headtails” implementation is tested, in terms of speed and handling of corner cases. A small benchmark against other styles is also presented.

Testing has been performed over the following distributions; the sample-generation code follows the lists:

Heavy-tailed distributions

  • Pareto
  • Exponential
  • Log-normal
  • Weibull
  • Log-Cauchy, also known as a super-heavy-tailed distribution (Falk, Huesler, and Reiss (2011), p. 80; Fraga Alves, Haan, and Neves (2009))

Non heavy-tailed distributions

  • Normal (non heavy-tailed)
  • Truncated Normal (left-tailed)
  • Uniform distribution
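All samples are generated as in the vignette source, with 5,000,000 observations each; the Log-Cauchy sample drops the draws that overflow to Inf:

# Init samples
set.seed(2389)

# Pareto
paretodist <- 7 / (1 - runif(5000000)) ^ (1 / 14)
# Exponential
expdist <- rexp(5000000)
# Log-normal
lognormdist <- rlnorm(5000000)
# Weibull
weibulldist <- rweibull(5000000, 1, scale = 5)
# Log-Cauchy, "super-heavy tail"
logcauchdist <- exp(rcauchy(5000000, 2, 4))
logcauchdist <- logcauchdist[logcauchdist < Inf] # remove Inf
# Normal
normdist <- rnorm(5000000)
# Truncated left-tailed normal
leftnorm <- sample(rep(normdist[normdist < mean(normdist)], 3),
                   size = 5000000)
# Uniform
unifdist <- runif(5000000)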

Let’s define a helper function and proceed to run the whole test suite:

testresults <- data.frame(
  Title = NA,
  style = NA,
  nsample  = NA,
  thresold = NA,
  nbreaks = NA,
  time_secs = NA
)

benchmarkdist <-
  function(dist,
           style = "headtails",
           thr = 0.4,
           title = "",
           plot = FALSE) {
    # Time the classIntervals() call (difftime, reported in seconds below)
    init <- Sys.time()
    br <- classIntervals(dist, style = style, thr = thr)
    a <- Sys.time() - init
    test <- data.frame(
      Title = title,
      style  = style,
      nsample  = format(length(br$var), 
                        scientific = FALSE, big.mark = ","),
      thresold = thr,
      nbreaks = length(br$brks) - 1,
      time_secs = as.character(round(a,4))
    )
    testresults <- unique(rbind(testresults, test))
    
    if (plot) {
      plot(
        density(br$var,
                from = quantile(dist,.0005),
                to = quantile(dist,.9995)
                ),
        col = "black",
        cex.main = .9,
        main = paste0(
          title,
          " ",
          style,
          ", thr =",
          thr,
          ", nbreaks = ",
          length(br$brks) - 1
        ),
        ylab = "",
        xlab = ""
      )
      abline(v = br$brks,
             col = "red3",
             lty = 2)
    }
    return(testresults)
  }
opar <- par(no.readonly = TRUE)
par(mar = c(2, 2, 2, 2), cex = 0.8)

# Pareto----
testresults <- benchmarkdist(paretodist, title = "Pareto", plot = TRUE)
testresults <- benchmarkdist(paretodist, title = "Pareto", thr = 0)
testresults <- benchmarkdist(paretodist, title = "Pareto", thr = .75, plot = TRUE)

#Sample 2,000 obs
set.seed(1234)
Paretosamp <- sample(paretodist, 2000)
testresults <- benchmarkdist(Paretosamp,
                             title = "Pareto sample",
                             style = "fisher",
                             plot = TRUE)
testresults <- benchmarkdist(Paretosamp,
                             title = "Pareto sample",
                             style = "headtails",
                             plot = TRUE)


#Exponential----
testresults <- benchmarkdist(expdist, title = "Exponential", plot = TRUE)
testresults <- benchmarkdist(expdist, title = "Exponential", thr = 0)
testresults <- benchmarkdist(expdist, title = "Exponential", thr = 1)
testresults <- benchmarkdist(expdist, title = "Exponential",
                             style = "quantile", plot = TRUE)

#Weibull-----
testresults <- benchmarkdist(weibulldist, title = "Weibull", plot = TRUE)
testresults <- benchmarkdist(weibulldist, title = "Weibull", thr = 0)
testresults <- benchmarkdist(weibulldist, title = "Weibull", thr = 1)

#Logcauchy
testresults <- benchmarkdist(logcauchdist, title = "LogCauchy", plot = TRUE)
testresults <- benchmarkdist(logcauchdist, title = "LogCauchy", thr = 0)
testresults <- benchmarkdist(logcauchdist, title = "LogCauchy", thr = 1)

#Normal----
testresults <- benchmarkdist(normdist, title = "Normal", plot = TRUE)
testresults <- benchmarkdist(normdist, title = "Normal", thr = 0)
testresults <- benchmarkdist(normdist, title = "Normal", thr = 1, plot = TRUE)

#Truncated Left-tail Normal----
testresults <- benchmarkdist(leftnorm, title = "Left Normal", plot = TRUE)
testresults <- benchmarkdist(leftnorm, title = "Left Normal", thr = -100)
testresults <- benchmarkdist(leftnorm, title = "Left Normal", plot = TRUE, thr = 500)

#Uniform----
testresults <- benchmarkdist(unifdist, title = "Uniform", plot = TRUE, thr = 0.7)
testresults <- benchmarkdist(unifdist, title = "Uniform", thr = 0)
testresults <- benchmarkdist(unifdist, title = "Uniform", plot = TRUE, thr = 1)
par(opar)

# Results
knitr::kable(testresults[-1, ], row.names = FALSE)
Title          style      nsample    thresold  nbreaks  time_secs
Pareto         headtails  5,000,000      0.40       15     0.6361
Pareto         headtails  5,000,000      0.00        2     0.4134
Pareto         headtails  5,000,000      0.75       15     0.4864
Pareto sample  fisher     2,000          0.40       12     0.0234
Pareto sample  headtails  2,000          0.40        8      3e-04
Exponential    headtails  5,000,000      0.40       16     0.5284
Exponential    headtails  5,000,000      0.00        2     0.4364
Exponential    headtails  5,000,000      1.00       17     0.5724
Exponential    quantile   5,000,000      0.40       24     0.8606
Weibull        headtails  5,000,000      0.40       16     0.5363
Weibull        headtails  5,000,000      0.00        2     0.4414
Weibull        headtails  5,000,000      1.00       17     0.5376
LogCauchy      headtails  4,991,187      0.40        6     0.4102
LogCauchy      headtails  4,991,187      0.00        2     0.4556
LogCauchy      headtails  4,991,187      1.00        6     0.4562
Normal         headtails  5,000,000      0.40        2     0.4963
Normal         headtails  5,000,000      0.00        2     0.4534
Normal         headtails  5,000,000      1.00       17      0.552
Left Normal    headtails  5,000,000      0.40        2     0.6246
Left Normal    headtails  5,000,000   -100.00        2      0.585
Left Normal    headtails  5,000,000    500.00       21     0.7941
Uniform        headtails  5,000,000      0.70       22     0.4963
Uniform        headtails  5,000,000      0.00        2     0.4122
Uniform        headtails  5,000,000      1.00       22     0.5308

The implementation works as expected, with a good performance given the size of the samples, and it also compares well with other current implementations in classIntervals.

References

Falk, Michael, Juerg Huesler, and Rolf-Dieter Reiss. 2011. Laws of Small Numbers: Extremes and Rare Events. Laws of Small Numbers: Extremes and Rare Events. https://doi.org/10.1007/978-3-0348-0009-9.

Fraga Alves, Maria, L. D. Haan, and Claudia Neves. 2009. “Statistical Inference for Heavy and Super-Heavy Tailed Distributions.” Journal of Statistical Planning and Inference 139 (January): 213–27.

Jiang, Bin. 2013. “Head/Tail Breaks: A New Classification Scheme for Data with a Heavy-Tailed Distribution.” The Professional Geographer 65 (3): 482–94. https://doi.org/10.1080/00330124.2012.700499.

———. 2019. “A Recursive Definition of Goodness of Space for Bridging the Concepts of Space and Place for Sustainability.” Sustainability 11 (15): 4091. https://doi.org/10.3390/su11154091.

Jiang, Bin, Xintao Liu, and Tao Jia. 2013. “Scaling of Geographic Space as a Universal Rule for Map Generalization.” Annals of the Association of American Geographers 103 (4): 844–55. https://doi.org/10.1080/00045608.2013.765773.

Jiang, Bin, and Junjun Yin. 2013. “Ht-Index for Quantifying the Fractal or Scaling Structure of Geographic Features.” Annals of the Association of American Geographers 104 (3): 530–40. https://doi.org/10.1080/00045608.2013.834239.

Taleb, Nassim Nicholas. 2008. The Black Swan: The Impact of the Highly Improbable. 1st ed. London: Random House.

Vasicek, Oldrich. 2002. “Loan Portfolio Value.” Risk, December, 160–62.


  [1] The method implemented in classInt corresponds to head/tails 1.0, as named in that article.

classInt/inst/doc/headtailsR.Rmd0000644000176200001440000005105213637450523016313 0ustar liggesusers--- title: "Head/Tails breaks on the `classInt` package." author: "Diego Hernangomez" date: '`r Sys.Date()`' output: rmarkdown::html_vignette: toc: true number_sections: false toc_depth: 1 bibliography: refs_ht.bib vignette: > %\VignetteIndexEntry{"Head/Tails breaks on the `classInt` package."} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- >*There are far more ordinary people (say, 80 percent) than extraordinary people (say, 20 percent); this is often characterized by the 80/20 principle, based on the observation made by the Italian economist Vilfredo Pareto in 1906 that 80% of land in Italy was owned by 20% of the population. A histogram of the data values for these phenomena would reveal a right-skewed or heavy-tailed distribution. How to map the data with the heavy-tailed distribution?* >
@Jiang_2013
# Abstract This vignette discusses the implementation of the "Head/tail breaks" style (@Jiang_2013) on the `classIntervals` function. A step-by-step example is presented in order to clarify the method. A case study using `spData::afcon` is also included, as well as a test suite checking the performance and validation of the implementation. ```{r setup, include=FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ``` # Introduction The **Head/tail breaks**, sometimes referred as **ht-index** (@Jiang2_2013), is a classification scheme introduced by @Jiang_2013 in order to find groupings or hierarchy for data with a heavy-tailed distribution. Heavy-tailed distributions are heavily right skewed, with a minority of large values in the head and a majority of small values in the tail. This imbalance between the head and tail, or between many small values and a few large values, can be expressed as *"far more small things than large things"*. Heavy tailed distributions are commonly characterized by a power law, a lognormal or an exponential function. Nature, society, finance (@vasicek2012) and our daily lives are full of rare and extreme events, which are termed "black swan events" (@taleb_black_2008). This line of thinking provides a good reason to reverse our thinking by focusing on low-frequency events. ```{r charheavytail,fig.show='hold'} library(classInt) #1. Characterization of heavy-tail distributions---- set.seed(1234) #Pareto distribution a=1 b=1.161 n=1000 sample_par <- 1 / (1 - runif(1000)) ^ (1 / 1.161) opar <- par(no.readonly = TRUE) par(mar = c(2, 4, 3, 1), cex = 0.8) plot( sort(sample_par, decreasing = TRUE), type = "l", ylab = "F(x)", xlab = "", main = "80/20 principle" ) abline(h = quantile(sample_par, .8) , lty = 2, col = "red3") abline(v = 0.2*length(sample_par) , lty = 2, col = "darkblue") legend( "topleft", legend = c("F(x): p80", "x: Top 20%"), col = c("red3", "darkblue"), lty = 2, cex = 0.8 ) hist( sample_par, n = 100, xlab = "", main = "Histogram", col = "grey50", border = NA, probability = TRUE ) par(opar) ``` # Breaking method The method itself consists on a four-step process performed recursively until a stopping condition is satisfied. Given a vector of values `var` the process can be described as follows: 1. Compute `mu = mean(var)`. 2. Break `var` into the `tail` (as `var < mu`) and the `head` (as `var > mu`). 3. Assess if the proportion of `head` over `var` is lower or equal than a given threshold (i.e. `length(head)/length(var) <= thr`) 4. If 3 is `TRUE`, repeat 1 to 3 until the condition is `FALSE` or no more partitions are possible (i.e. `head` has less than two elements expressed as `length(head) < 2`). It is important to note that, at the beginning of a new iteration, `var` is replaced by `head`. The underlying hypothesis is to create partitions until the head and the tail are balanced in terms of distribution.So the stopping criteria is satisfied when the last head and the last tail are evenly balanced. In terms of threshold, @Jiang3_2013 set 40% as a good approximation, meaning that if the head contains more than 40% of the observations the distribution is not considered heavy-tailed. The final breaks are the vector of consecutive `mu`. # Step by step example We reproduce here the pseudo-code^[The method implemented on `classInt` corresponds to head/tails 1.0 as named on this article.] 
as per @Jiang_2019: ``` Recursive function Head/tail Breaks: Rank the input data from the largest to the smallest Break the data into the head and the tail around the mean; // the head for those above the mean // the tail for those below the mean While (head <= 40%): Head/tail Breaks (head); End Function ``` A step-by-step example in **R** (for illustrative purposes) has been developed: ```{r stepbystep, fig.show='hold'} opar <- par(no.readonly = TRUE) par(mar = c(2, 2, 3, 1), cex = 0.8) var <- sample_par thr <- .4 brks <- c(min(var), max(var)) #Initialise with min and max sum_table <- data.frame( iter = 0, mu = NA, prop = NA, n_var = NA, n_head = NA ) #Pars for chart limchart <- brks #Iteration for (i in 1:10) { mu <- mean(var) brks <- sort(c(brks, mu)) head <- var[var > mu] prop <- length(head) / length(var) stopit <- prop < thr & length(head) > 1 sum_table = rbind(sum_table, c(i, mu, prop, length(var), length(head))) hist( var, main = paste0("Iter ", i), breaks = 50, col = "grey50", border = NA, xlab = "", xlim = limchart ) abline(v = mu, col = "red3", lty = 2) ylabel <- max(hist(var, breaks = 50, plot = FALSE)$counts) labelplot <- paste0("PropHead: ", round(prop * 100, 2), "%") text( x = mu, y = ylabel, labels = labelplot, cex = 0.8, pos = 4 ) legend( "right", legend = paste0("mu", i), col = c("red3"), lty = 2, cex = 0.8 ) if (isFALSE(stopit)) break var <- head } par(opar) ``` As it can be seen, in each iteration the resulting head gradually loses the high-tail property, until the stopping condition is met. ```{r hiddtable, echo=FALSE} sum_table$mu <- round(sum_table$mu,4) sum_table$prop <- paste0(round(100*sum_table$prop,2),"%") knitr::kable(sum_table[!is.na(sum_table$mu),], row.names = FALSE) ``` The resulting breaks are then defined as `breaks = c(min(var), mu(iter=1), ..., mu(iter), max(var))`. # Implementation on `classInt` package The implementation in the `classIntervals` function should replicate the results: ```{r checkmethod} ht_sample_par <- classIntervals(sample_par, style = "headtails") brks == ht_sample_par$brks print(ht_sample_par) ``` As stated in @Jiang_2013, the number of breaks is naturally determined, however the `thr` parameter could help to adjust the final number. A lower value on `thr` would provide less breaks while a larger `thr` would increase the number, if the underlying distribution follows the *"far more small things than large things"* principle. ```{r examplesimp, fig.show='hold', fig.asp=.7} opar <- par(no.readonly = TRUE) par(mar = c(2, 2, 2, 1), cex = 0.8) pal1 <- c("wheat1", "wheat2", "red3") # Minimum: single break print(classIntervals(sample_par, style = "headtails", thr = 0)) plot( classIntervals(sample_par, style = "headtails", thr = 0), pal = pal1, main = "thr = 0" ) # Two breaks print(classIntervals(sample_par, style = "headtails", thr = 0.2)) plot( classIntervals(sample_par, style = "headtails", thr = 0.2), pal = pal1, main = "thr = 0.2" ) # Default breaks: 0.4 print(classIntervals(sample_par, style = "headtails")) plot(classIntervals(sample_par, style = "headtails"), pal = pal1, main = "thr = Default") # Maximum breaks print(classIntervals(sample_par, style = "headtails", thr = 1)) plot( classIntervals(sample_par, style = "headtails", thr = 1), pal = pal1, main = "thr = 1" ) par(opar) ``` The method always returns at least one break, corresponding to `mean(var)`. 
# Case study @Jiang_2013 states that "the new classification scheme is more natural than the natural breaks in finding the groupings or hierarchy for data with a heavy-tailed distribution." (p. 482), referring to Jenks' natural breaks method. In this case study we would compare "headtails" vs. "fisher", that is the alias for the Fisher-Jenks algorithm and it is always preferred to the "jenks" style (see `?classIntervals`). For this example we will use the `afcon` dataset from `spData` package. ```{r loadspdata, message=FALSE} library(spData) data(afcon, package = "spData") ``` Let's have a look to the Top 10 values and the distribution of the variable `totcon` (index of total conflict 1966-78): ```{r summspdata, fig.show='hold'} # Top10 knitr::kable(head(afcon[order(afcon$totcon, decreasing = TRUE),c("name","totcon")],10)) opar <- par(no.readonly = TRUE) par(mar = c(4, 4, 3, 1), cex = 0.8) hist(afcon$totcon, n = 20, main = "Histogram", xlab = "totcon", col = "grey50", border = NA, ) plot( density(afcon$totcon), main = "Distribution", xlab = "totcon", ) par(opar) ``` The data shows that EG and SU data present a clear hierarchy over the rest of values. As per the histogram, we can confirm a heavy-tailed distribution and therefore the *"far more small things than large things"* principle. As a testing proof, on top of "headtails" and "fisher" we would use also "quantile" to have a broader view on the different breaking styles. As "quantile" is a position-based metric, it doesn't account for the magnitude of F(x) (hierarchy), so the breaks are solely defined by the position of x on the distribution. Applying the three aforementioned methods to break the data: ```{r breaksample,fig.show='hold'} brks_ht <- classIntervals(afcon$totcon, style = "headtails") print(brks_ht) #Same number of classes for "fisher" nclass <- length(brks_ht$brks) - 1 brks_fisher <- classIntervals(afcon$totcon, style = "fisher", n = nclass) print(brks_fisher) brks_quantile <- classIntervals(afcon$totcon, style = "quantile", n = nclass) print(brks_quantile) pal1 <- c("wheat1", "wheat2", "red3") opar <- par(no.readonly = TRUE) par(mar = c(2, 2, 2, 1), cex = 0.8) plot(brks_ht, pal = pal1, main = "headtails") plot(brks_fisher, pal = pal1, main = "fisher") plot(brks_quantile, pal = pal1, main = "quantile") par(opar) ``` It is observed that the top three classes of "headtails" enclose 5 observations, whereas "fisher" includes 13 observations. In terms of classification, "headtails" breaks focuses more on extreme values. The next plot compares a continuous distribution of `totcon` re-escalated to a range of `[1,nclass]` versus the distribution across breaks for each style. The continuous distribution has been offset by -0.5 in order to align the continuous and the discrete distributions. 
```{r benchmarkbreaks, fig.show='hold', fig.width=7} #Helper function to reescale values help_reescale <- function(x, min = 1, max = 10) { r <- (x - min(x)) / (max(x) - min(x)) r <- r * (max - min) + min return(r) } afcon$ecdf_class <- help_reescale(afcon$totcon, min = 1 - 0.5, max = nclass - 0.5) afcon$ht_breaks <- cut(afcon$totcon, brks_ht$brks, labels = FALSE, include.lowest = TRUE) afcon$fisher_breaks <- cut(afcon$totcon, brks_fisher$brks, labels = FALSE, include.lowest = TRUE) afcon$quantile_break <- cut(afcon$totcon, brks_quantile$brks, labels = FALSE, include.lowest = TRUE) opar <- par(no.readonly = TRUE) par(mar = c(4, 4, 1, 1), cex = 0.8) plot( density(afcon$ecdf_class), ylim = c(0, 0.8), lwd = 2, main = "", xlab = "class" ) lines(density(afcon$ht_breaks), col = "darkblue", lty = 2) lines(density(afcon$fisher_breaks), col = "limegreen", lty = 2) lines(density(afcon$quantile_break), col = "red3", lty = 2) legend("topright", legend = c("Continuous", "headtails", "fisher", "quantile"), col = c("black", "darkblue", "limegreen", "red3"), lwd = c(2, 1, 1, 1), lty = c(1, 2, 2, 2), cex = 0.8 ) par(opar) ``` It can be observed that the distribution of "headtails" breaks is also heavy-tailed, and closer to the original distribution. On the other extreme, "quantile" provides a quasi-uniform distribution, ignoring the `totcon` hierarchy In terms of data visualization, we compare here the final map using the techniques mentioned above. On this plotting exercise: - `cex` of points are always between `1` and `5`. - For the continuous approach, no classes are provided. This plot will be used as the reference. - For all the rest of styles, `col` and `cex` on each point is defined as per the class of that point. ```{r finalplot , fig.show='hold', fig.asp=1.2} custompal <- c("#FE9F6D99", "#DE496899", "#8C298199", "#3B0F7099", "#00000499") afcon$cex_points <- help_reescale(afcon$totcon, min = 1, max = 5) opar <- par(no.readonly = TRUE) par(mar = c(1.5, 1.5, 2, 1.5), cex = 0.8) # Plot continuous plot( x = afcon$x, y = afcon$y, axes = FALSE, cex = afcon$cex_points, pch = 20, col = "grey50", main = "Continuous", ) mcont <- (max(afcon$totcon) - min(afcon$totcon)) / 4 legcont <- 1:5 * mcont - (mcont - min(afcon$totcon)) legend("bottomleft", xjust = 1, bty = "n", legend = paste0(" ", round(legcont, 0) ), col = "grey50", pt.cex = seq(1, 5), pch = 20, title = "totcon" ) box() plot( x = afcon$x, y = afcon$y, axes = FALSE, cex = afcon$ht_breaks, pch = 20, col = custompal[afcon$ht_breaks], main = "headtails" ) legend( "bottomleft", xjust = 1, bty = "n", legend = paste0(" ", round(brks_ht$brks[2:6],0) ), col = custompal, pt.cex = seq(1, 5), pch = 20, title = "totcon" ) box() plot( x = afcon$x, y = afcon$y, axes = FALSE, cex = afcon$fisher_breaks, pch = 20, col = custompal[afcon$fisher_breaks], main = "fisher" ) legend( "bottomleft", xjust = 1, bty = "n", legend = paste0(" ", round(brks_fisher$brks[2:6],0) ), col = custompal, pt.cex = seq(1, 5), pch = 20, title = "totcon" ) box() plot( x = afcon$x, y = afcon$y, axes = FALSE, cex = afcon$quantile_break, pch = 20, col = custompal[afcon$quantile_break], main = "quantile" ) legend( "bottomleft", xjust = 1, bty = "n", legend = paste0(" ", round(brks_quantile$brks[2:6],0) ), col = custompal, pt.cex = seq(1, 5), pch = 20, title = "totcon" ) box() par(opar) ``` As per the results, "headtails" seems to provide a better understanding of the most extreme values when the result is compared against the continuous plot. 
The "quantile" style, as expected, just provides a clustering without taking into account the real hierarchy. The "fisher" plot is in-between of these two interpretations. It is also important to note that "headtails" and "fisher" reveal different information that can be useful depending of the context. While "headtails" highlights the outliers, it fails on providing a good clustering on the tail, while "fisher" seems to reflect better these patterns. This can be observed on the values of Western Africa and the Niger River Basin, where "headtails" doesn't highlight any special cluster of conflicts, "fisher" suggests a potential cluster. This can be confirmed on the histogram generated previously, where a concentration of `totcon` around 1,000 is visible. # Testing and benchmark On this section the performance of the "headtails" implementation is tested, in terms of speed and handling of corner cases. A small benchmark with another styles is also presented. Testing has been performed over the following distributions: **Heavy-tailed distributions** - Pareto - Exponential - Log-normal - Weibull - Log-Cauchy, also known as super-heavy tail distribution (@Falk_2011, p. 80, @Fraga_2009) **Non heavy-tailed distributions** - Normal (non heavy-tailed) - Truncated Normal (left-tailed) - Uniform distribution ```{r distest, fig.show='hold'} #Init samples set.seed(2389) #Pareto distributions a=7 b=14 paretodist <- 7 / (1 - runif(5000000)) ^ (1 / 14) #Exponential dist expdist <- rexp(5000000) #Lognorm lognormdist <- rlnorm(5000000) #Weibull weibulldist <- rweibull(5000000, 1, scale = 5) #LogCauchy "super-heavy tail" logcauchdist <- exp(rcauchy(5000000, 2, 4)) #Remove Inf logcauchdist <- logcauchdist[logcauchdist < Inf] #Normal dist normdist <- rnorm(5000000) #Left-tailed distr leftnorm <- sample(rep(normdist[normdist < mean(normdist)], 3), size = 5000000) #Uniform distribution unifdist <- runif(5000000) ``` Let's define a helper function and proceed to run the whole test suite: ```{r testresults, fig.show='hold'} testresults <- data.frame( Title = NA, style = NA, nsample = NA, thresold = NA, nbreaks = NA, time_secs = NA ) benchmarkdist <- function(dist, style = "headtails", thr = 0.4, title = "", plot = FALSE) { init <- Sys.time() br <- classIntervals(dist, style = style, thr = thr) a <- Sys.time() - init test <- data.frame( Title = title, style = style, nsample = format(length(br$var), scientific = FALSE, big.mark = ","), thresold = thr, nbreaks = length(br$brks) - 1, time_secs = as.character(round(a,4)) ) testresults <- unique(rbind(testresults, test)) if (plot) { plot( density(br$var, from = quantile(dist,.0005), to = quantile(dist,.9995) ), col = "black", cex.main = .9, main = paste0( title, " ", style, ", thr =", thr, ", nbreaks = ", length(br$brks) - 1 ), ylab = "", xlab = "" ) abline(v = br$brks, col = "red3", lty = 2) } return(testresults) } opar <- par(no.readonly = TRUE) par(mar = c(2, 2, 2, 2), cex = 0.8) # Pareto---- testresults <- benchmarkdist(paretodist, title = "Pareto", plot = TRUE) testresults <- benchmarkdist(paretodist, title = "Pareto", thr = 0) testresults <- benchmarkdist(paretodist, title = "Pareto", thr = .75, plot = TRUE) #Sample 2,000 obs set.seed(1234) Paretosamp <- sample(paretodist, 2000) testresults <- benchmarkdist(Paretosamp, title = "Pareto sample", style = "fisher", plot = TRUE) testresults <- benchmarkdist(Paretosamp, title = "Pareto sample", style = "headtails", plot = TRUE) #Exponential---- testresults <- benchmarkdist(expdist, title = "Exponential", plot = TRUE) 
testresults <- benchmarkdist(expdist, title = "Exponential", thr = 0) testresults <- benchmarkdist(expdist, title = "Exponential", thr = 1) testresults <- benchmarkdist(expdist, title = "Exponential", style = "quantile", plot = TRUE) #Weibull----- testresults <- benchmarkdist(weibulldist, title = "Weibull", plot = TRUE) testresults <- benchmarkdist(weibulldist, title = "Weibull", thr = 0) testresults <- benchmarkdist(weibulldist, title = "Weibull", thr = 1) #Logcauchy testresults <- benchmarkdist(logcauchdist, title = "LogCauchy", plot = TRUE) testresults <- benchmarkdist(logcauchdist, title = "LogCauchy", thr = 0) testresults <- benchmarkdist(logcauchdist, title = "LogCauchy", thr = 1) #Normal---- testresults <- benchmarkdist(normdist, title = "Normal", plot = TRUE) testresults <- benchmarkdist(normdist, title = "Normal", thr = 0) testresults <- benchmarkdist(normdist, title = "Normal", thr = 1, plot = TRUE) #Truncated Left-tail Normal---- testresults <- benchmarkdist(leftnorm, title = "Left Normal", plot = TRUE) testresults <- benchmarkdist(leftnorm, title = "Left Normal", thr = -100) testresults <- benchmarkdist(leftnorm, title = "Left Normal", plot = TRUE, thr = 500) #Uniform---- testresults <- benchmarkdist(unifdist, title = "Uniform", plot = TRUE, thr = 0.7) testresults <- benchmarkdist(unifdist, title = "Uniform", thr = 0) testresults <- benchmarkdist(unifdist, title = "Uniform", plot = TRUE, thr = 1) par(opar) # Results knitr::kable(testresults[-1, ], row.names = FALSE) ``` The implementation works as expected, with a good performance given the size of the sample, and also compares well with another current implementations on `classIntervals`. # References classInt/inst/doc/headtailsR.R0000644000176200001440000003106213642404235015764 0ustar liggesusers## ----setup, include=FALSE----------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----charheavytail,fig.show='hold'-------------------------------------------- library(classInt) #1. 
Characterization of heavy-tail distributions---- set.seed(1234) #Pareto distribution a=1 b=1.161 n=1000 sample_par <- 1 / (1 - runif(1000)) ^ (1 / 1.161) opar <- par(no.readonly = TRUE) par(mar = c(2, 4, 3, 1), cex = 0.8) plot( sort(sample_par, decreasing = TRUE), type = "l", ylab = "F(x)", xlab = "", main = "80/20 principle" ) abline(h = quantile(sample_par, .8) , lty = 2, col = "red3") abline(v = 0.2*length(sample_par) , lty = 2, col = "darkblue") legend( "topleft", legend = c("F(x): p80", "x: Top 20%"), col = c("red3", "darkblue"), lty = 2, cex = 0.8 ) hist( sample_par, n = 100, xlab = "", main = "Histogram", col = "grey50", border = NA, probability = TRUE ) par(opar) ## ----stepbystep, fig.show='hold'---------------------------------------------- opar <- par(no.readonly = TRUE) par(mar = c(2, 2, 3, 1), cex = 0.8) var <- sample_par thr <- .4 brks <- c(min(var), max(var)) #Initialise with min and max sum_table <- data.frame( iter = 0, mu = NA, prop = NA, n_var = NA, n_head = NA ) #Pars for chart limchart <- brks #Iteration for (i in 1:10) { mu <- mean(var) brks <- sort(c(brks, mu)) head <- var[var > mu] prop <- length(head) / length(var) stopit <- prop < thr & length(head) > 1 sum_table = rbind(sum_table, c(i, mu, prop, length(var), length(head))) hist( var, main = paste0("Iter ", i), breaks = 50, col = "grey50", border = NA, xlab = "", xlim = limchart ) abline(v = mu, col = "red3", lty = 2) ylabel <- max(hist(var, breaks = 50, plot = FALSE)$counts) labelplot <- paste0("PropHead: ", round(prop * 100, 2), "%") text( x = mu, y = ylabel, labels = labelplot, cex = 0.8, pos = 4 ) legend( "right", legend = paste0("mu", i), col = c("red3"), lty = 2, cex = 0.8 ) if (isFALSE(stopit)) break var <- head } par(opar) ## ----hiddtable, echo=FALSE---------------------------------------------------- sum_table$mu <- round(sum_table$mu,4) sum_table$prop <- paste0(round(100*sum_table$prop,2),"%") knitr::kable(sum_table[!is.na(sum_table$mu),], row.names = FALSE) ## ----checkmethod-------------------------------------------------------------- ht_sample_par <- classIntervals(sample_par, style = "headtails") brks == ht_sample_par$brks print(ht_sample_par) ## ----examplesimp, fig.show='hold', fig.asp=.7--------------------------------- opar <- par(no.readonly = TRUE) par(mar = c(2, 2, 2, 1), cex = 0.8) pal1 <- c("wheat1", "wheat2", "red3") # Minimum: single break print(classIntervals(sample_par, style = "headtails", thr = 0)) plot( classIntervals(sample_par, style = "headtails", thr = 0), pal = pal1, main = "thr = 0" ) # Two breaks print(classIntervals(sample_par, style = "headtails", thr = 0.2)) plot( classIntervals(sample_par, style = "headtails", thr = 0.2), pal = pal1, main = "thr = 0.2" ) # Default breaks: 0.4 print(classIntervals(sample_par, style = "headtails")) plot(classIntervals(sample_par, style = "headtails"), pal = pal1, main = "thr = Default") # Maximum breaks print(classIntervals(sample_par, style = "headtails", thr = 1)) plot( classIntervals(sample_par, style = "headtails", thr = 1), pal = pal1, main = "thr = 1" ) par(opar) ## ----loadspdata, message=FALSE------------------------------------------------ library(spData) data(afcon, package = "spData") ## ----summspdata, fig.show='hold'---------------------------------------------- # Top10 knitr::kable(head(afcon[order(afcon$totcon, decreasing = TRUE),c("name","totcon")],10)) opar <- par(no.readonly = TRUE) par(mar = c(4, 4, 3, 1), cex = 0.8) hist(afcon$totcon, n = 20, main = "Histogram", xlab = "totcon", col = "grey50", border = NA, ) plot( 
density(afcon$totcon), main = "Distribution", xlab = "totcon", ) par(opar) ## ----breaksample,fig.show='hold'---------------------------------------------- brks_ht <- classIntervals(afcon$totcon, style = "headtails") print(brks_ht) #Same number of classes for "fisher" nclass <- length(brks_ht$brks) - 1 brks_fisher <- classIntervals(afcon$totcon, style = "fisher", n = nclass) print(brks_fisher) brks_quantile <- classIntervals(afcon$totcon, style = "quantile", n = nclass) print(brks_quantile) pal1 <- c("wheat1", "wheat2", "red3") opar <- par(no.readonly = TRUE) par(mar = c(2, 2, 2, 1), cex = 0.8) plot(brks_ht, pal = pal1, main = "headtails") plot(brks_fisher, pal = pal1, main = "fisher") plot(brks_quantile, pal = pal1, main = "quantile") par(opar) ## ----benchmarkbreaks, fig.show='hold', fig.width=7---------------------------- #Helper function to reescale values help_reescale <- function(x, min = 1, max = 10) { r <- (x - min(x)) / (max(x) - min(x)) r <- r * (max - min) + min return(r) } afcon$ecdf_class <- help_reescale(afcon$totcon, min = 1 - 0.5, max = nclass - 0.5) afcon$ht_breaks <- cut(afcon$totcon, brks_ht$brks, labels = FALSE, include.lowest = TRUE) afcon$fisher_breaks <- cut(afcon$totcon, brks_fisher$brks, labels = FALSE, include.lowest = TRUE) afcon$quantile_break <- cut(afcon$totcon, brks_quantile$brks, labels = FALSE, include.lowest = TRUE) opar <- par(no.readonly = TRUE) par(mar = c(4, 4, 1, 1), cex = 0.8) plot( density(afcon$ecdf_class), ylim = c(0, 0.8), lwd = 2, main = "", xlab = "class" ) lines(density(afcon$ht_breaks), col = "darkblue", lty = 2) lines(density(afcon$fisher_breaks), col = "limegreen", lty = 2) lines(density(afcon$quantile_break), col = "red3", lty = 2) legend("topright", legend = c("Continuous", "headtails", "fisher", "quantile"), col = c("black", "darkblue", "limegreen", "red3"), lwd = c(2, 1, 1, 1), lty = c(1, 2, 2, 2), cex = 0.8 ) par(opar) ## ----finalplot , fig.show='hold', fig.asp=1.2--------------------------------- custompal <- c("#FE9F6D99", "#DE496899", "#8C298199", "#3B0F7099", "#00000499") afcon$cex_points <- help_reescale(afcon$totcon, min = 1, max = 5) opar <- par(no.readonly = TRUE) par(mar = c(1.5, 1.5, 2, 1.5), cex = 0.8) # Plot continuous plot( x = afcon$x, y = afcon$y, axes = FALSE, cex = afcon$cex_points, pch = 20, col = "grey50", main = "Continuous", ) mcont <- (max(afcon$totcon) - min(afcon$totcon)) / 4 legcont <- 1:5 * mcont - (mcont - min(afcon$totcon)) legend("bottomleft", xjust = 1, bty = "n", legend = paste0(" ", round(legcont, 0) ), col = "grey50", pt.cex = seq(1, 5), pch = 20, title = "totcon" ) box() plot( x = afcon$x, y = afcon$y, axes = FALSE, cex = afcon$ht_breaks, pch = 20, col = custompal[afcon$ht_breaks], main = "headtails" ) legend( "bottomleft", xjust = 1, bty = "n", legend = paste0(" ", round(brks_ht$brks[2:6],0) ), col = custompal, pt.cex = seq(1, 5), pch = 20, title = "totcon" ) box() plot( x = afcon$x, y = afcon$y, axes = FALSE, cex = afcon$fisher_breaks, pch = 20, col = custompal[afcon$fisher_breaks], main = "fisher" ) legend( "bottomleft", xjust = 1, bty = "n", legend = paste0(" ", round(brks_fisher$brks[2:6],0) ), col = custompal, pt.cex = seq(1, 5), pch = 20, title = "totcon" ) box() plot( x = afcon$x, y = afcon$y, axes = FALSE, cex = afcon$quantile_break, pch = 20, col = custompal[afcon$quantile_break], main = "quantile" ) legend( "bottomleft", xjust = 1, bty = "n", legend = paste0(" ", round(brks_quantile$brks[2:6],0) ), col = custompal, pt.cex = seq(1, 5), pch = 20, title = "totcon" ) box() par(opar) ## 
----distest, fig.show='hold'------------------------------------------------- #Init samples set.seed(2389) #Pareto distributions a=7 b=14 paretodist <- 7 / (1 - runif(5000000)) ^ (1 / 14) #Exponential dist expdist <- rexp(5000000) #Lognorm lognormdist <- rlnorm(5000000) #Weibull weibulldist <- rweibull(5000000, 1, scale = 5) #LogCauchy "super-heavy tail" logcauchdist <- exp(rcauchy(5000000, 2, 4)) #Remove Inf logcauchdist <- logcauchdist[logcauchdist < Inf] #Normal dist normdist <- rnorm(5000000) #Left-tailed distr leftnorm <- sample(rep(normdist[normdist < mean(normdist)], 3), size = 5000000) #Uniform distribution unifdist <- runif(5000000) ## ----testresults, fig.show='hold'--------------------------------------------- testresults <- data.frame( Title = NA, style = NA, nsample = NA, thresold = NA, nbreaks = NA, time_secs = NA ) benchmarkdist <- function(dist, style = "headtails", thr = 0.4, title = "", plot = FALSE) { init <- Sys.time() br <- classIntervals(dist, style = style, thr = thr) a <- Sys.time() - init test <- data.frame( Title = title, style = style, nsample = format(length(br$var), scientific = FALSE, big.mark = ","), thresold = thr, nbreaks = length(br$brks) - 1, time_secs = as.character(round(a,4)) ) testresults <- unique(rbind(testresults, test)) if (plot) { plot( density(br$var, from = quantile(dist,.0005), to = quantile(dist,.9995) ), col = "black", cex.main = .9, main = paste0( title, " ", style, ", thr =", thr, ", nbreaks = ", length(br$brks) - 1 ), ylab = "", xlab = "" ) abline(v = br$brks, col = "red3", lty = 2) } return(testresults) } opar <- par(no.readonly = TRUE) par(mar = c(2, 2, 2, 2), cex = 0.8) # Pareto---- testresults <- benchmarkdist(paretodist, title = "Pareto", plot = TRUE) testresults <- benchmarkdist(paretodist, title = "Pareto", thr = 0) testresults <- benchmarkdist(paretodist, title = "Pareto", thr = .75, plot = TRUE) #Sample 2,000 obs set.seed(1234) Paretosamp <- sample(paretodist, 2000) testresults <- benchmarkdist(Paretosamp, title = "Pareto sample", style = "fisher", plot = TRUE) testresults <- benchmarkdist(Paretosamp, title = "Pareto sample", style = "headtails", plot = TRUE) #Exponential---- testresults <- benchmarkdist(expdist, title = "Exponential", plot = TRUE) testresults <- benchmarkdist(expdist, title = "Exponential", thr = 0) testresults <- benchmarkdist(expdist, title = "Exponential", thr = 1) testresults <- benchmarkdist(expdist, title = "Exponential", style = "quantile", plot = TRUE) #Weibull----- testresults <- benchmarkdist(weibulldist, title = "Weibull", plot = TRUE) testresults <- benchmarkdist(weibulldist, title = "Weibull", thr = 0) testresults <- benchmarkdist(weibulldist, title = "Weibull", thr = 1) #Logcauchy testresults <- benchmarkdist(logcauchdist, title = "LogCauchy", plot = TRUE) testresults <- benchmarkdist(logcauchdist, title = "LogCauchy", thr = 0) testresults <- benchmarkdist(logcauchdist, title = "LogCauchy", thr = 1) #Normal---- testresults <- benchmarkdist(normdist, title = "Normal", plot = TRUE) testresults <- benchmarkdist(normdist, title = "Normal", thr = 0) testresults <- benchmarkdist(normdist, title = "Normal", thr = 1, plot = TRUE) #Truncated Left-tail Normal---- testresults <- benchmarkdist(leftnorm, title = "Left Normal", plot = TRUE) testresults <- benchmarkdist(leftnorm, title = "Left Normal", thr = -100) testresults <- benchmarkdist(leftnorm, title = "Left Normal", plot = TRUE, thr = 500) #Uniform---- testresults <- benchmarkdist(unifdist, title = "Uniform", plot = TRUE, thr = 0.7) testresults <- 
benchmarkdist(unifdist, title = "Uniform", thr = 0) testresults <- benchmarkdist(unifdist, title = "Uniform", plot = TRUE, thr = 1) par(opar) # Results knitr::kable(testresults[-1, ], row.names = FALSE)