quantiles-0.7.1/.gitignore010064400007660000024000000000271273506053100137350ustar0000000000000000target Cargo.lock *.bk quantiles-0.7.1/.rustfmt.toml010064400007660000024000000001141303373440000144140ustar0000000000000000reorder_imports = true reorder_imported_names = true wrap_comments = true quantiles-0.7.1/.travis.yml010064400007660000024000000014211303424063700140550ustar0000000000000000language: rust rust: - stable - beta - nightly matrix: allow_failures: - rust: nightly before_install: - sudo apt-get update addons: apt: packages: - libcurl4-openssl-dev - libelf-dev - libdw-dev - cmake - gcc - binutils-dev after_success: | wget https://github.com/SimonKagstrom/kcov/archive/master.tar.gz && tar xzf master.tar.gz && cd kcov-master && mkdir build && cd build && cmake .. && make && sudo make install && cd ../.. && rm -rf kcov-master && for file in target/debug/quantiles-*; do mkdir -p "target/cov/$(basename $file)"; kcov --exclude-pattern=/.cargo,/usr/lib --verify "target/cov/$(basename $file)" "$file"; done && bash <(curl -s https://codecov.io/bash) && echo "Uploaded code coverage" quantiles-0.7.1/benches/ckms.rs010064400007660000024000000066721321360537700147000ustar0000000000000000#![feature(test)] extern crate test; extern crate quantiles; mod ckms { #[derive(Debug, Clone, Copy)] pub struct Xorshift { seed: u64, } impl Xorshift { pub fn new(seed: u64) -> Xorshift { Xorshift { seed: seed } } pub fn next_val(&mut self) -> u32 { // implementation inspired by // https://github.com/astocko/xorshift/blob/master/src/splitmix64.rs use std::num::Wrapping as w; let mut z = w(self.seed) + w(0x9E37_79B9_7F4A_7C15_u64); let nxt_seed = z.0; z = (z ^ (z >> 30)) * w(0xBF58_476D_1CE4_E5B9_u64); z = (z ^ (z >> 27)) * w(0x94D0_49BB_1331_11EB_u64); self.seed = nxt_seed; u32::from((z ^ (z >> 31)).0 as u16) } } use quantiles::ckms::CKMS; use test::Bencher; macro_rules! generate_tests { ($t:ty, $fn:ident, $s:expr) => { #[bench] fn $fn(b: &mut Bencher) { let mut xshft = Xorshift::new(1972); b.iter(|| { let mut ckms = CKMS::<$t>::new(0.001); for _ in 0..$s { let val = xshft.next_val(); ckms.insert(val as $t); } }); } } } macro_rules! 
generate_primed_tests { ($t:ty, $fn:ident, $s:expr) => { #[bench] fn $fn(b: &mut Bencher) { let mut xshft = Xorshift::new(1972); let mut ckms = CKMS::<$t>::new(0.001); for _ in 0..1_000_000 { let elem = xshft.next_val() as $t; ckms.insert(elem); } b.iter(|| { let elem = xshft.next_val() as $t; ckms.insert(elem); }); } } } mod u16 { use super::*; generate_tests!(u16, bench_insert_100, 100); generate_tests!(u16, bench_insert_1000, 1000); generate_tests!(u16, bench_insert_10000, 10_000); generate_tests!(u16, bench_insert_65535, 65_535); generate_primed_tests!(u16, bench_primed_100, 100); generate_primed_tests!(u16, bench_primed_1000, 1000); generate_primed_tests!(u16, bench_primed_10000, 10_000); generate_primed_tests!(u16, bench_primed_65535, 65_535); } mod u32 { use super::*; generate_tests!(u32, bench_insert_100, 100); generate_tests!(u32, bench_insert_1000, 1000); generate_tests!(u32, bench_insert_10000, 10_000); generate_tests!(u32, bench_insert_100000, 100_000); generate_primed_tests!(u32, bench_primed_100, 100); generate_primed_tests!(u32, bench_primed_1000, 1000); generate_primed_tests!(u32, bench_primed_10000, 10_000); generate_primed_tests!(u32, bench_primed_65535, 65_535); } mod f32 { use super::*; generate_tests!(f32, bench_insert_100, 100); generate_tests!(f32, bench_insert_1000, 1000); generate_tests!(f32, bench_insert_10000, 10_000); generate_tests!(f32, bench_insert_100000, 100_000); generate_primed_tests!(f32, bench_primed_100, 100); generate_primed_tests!(f32, bench_primed_1000, 1000); generate_primed_tests!(f32, bench_primed_10000, 10_000); generate_primed_tests!(f32, bench_primed_65535, 65_535); } } quantiles-0.7.1/Cargo.toml.orig010064400007660000024000000011201324566637300146440ustar0000000000000000[package] name = "quantiles" version = "0.7.1" authors = ["Brian L. Troutwine "] description = "a collection of approximate quantile algorithms" repository = "https://github.com/postmates/quantiles" readme = "README.md" license = "MIT" keywords = ["statistics", "histogram", "quantiles", "percentiles", "approximation"] [profile.release] lto = true [dev-dependencies] quickcheck = "0.5" [dependencies] serde = { version = "1.0", optional = true } serde_derive = { version = "1.0", optional = true } [features] default = [] serde_support = ["serde", "serde_derive"] quantiles-0.7.1/Cargo.toml0000644000000021370000000000000111060ustar00# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g. crates.io) dependencies # # If you believe there's an error in this file please file an # issue against the rust-lang/cargo repository. If you're # editing this file be aware that the upstream Cargo.toml # will likely look very different (and much more reasonable) [package] name = "quantiles" version = "0.7.1" authors = ["Brian L. 
Troutwine "] description = "a collection of approximate quantile algorithms" readme = "README.md" keywords = ["statistics", "histogram", "quantiles", "percentiles", "approximation"] license = "MIT" repository = "https://github.com/postmates/quantiles" [profile.release] lto = true [dependencies.serde] version = "1.0" optional = true [dependencies.serde_derive] version = "1.0" optional = true [dev-dependencies.quickcheck] version = "0.5" [features] default = [] serde_support = ["serde", "serde_derive"]
quantiles-0.7.1/codecov.yml
coverage:
  precision: 2
  round: down
  range: 70...100

  status:
    # Learn more at http://docs.codecov.io/docs/codecov-yaml
    changes: false
    patch:
      default:
        target: 100%
    project:
      default:
        target: 94%
        threshold: 1%

comment:
  layout: "header, diff"
  behavior: default # update if exists else create new
quantiles-0.7.1/LICENSE.txt
MIT License

Copyright (c) 2016 Postmates Inc.

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
quantiles-0.7.1/README.md
# quantiles

[![Build Status](https://travis-ci.org/postmates/quantiles.svg?branch=master)](https://travis-ci.org/postmates/quantiles) [![Codecov](https://img.shields.io/codecov/c/github/postmates/quantiles.svg)](https://codecov.io/gh/postmates/quantiles) [![Crates.io](https://img.shields.io/crates/v/quantiles.svg)](https://crates.io/crates/quantiles)

This crate is intended to be a collection of approximate quantile algorithms that provide guarantees around space and computation. Recent literature has advanced approximation techniques, but no single technique is generally applicable and each comes with fundamental tradeoffs. Initial work was done to support internal Postmates projects, but the hope is that the crate can be generally useful.

## The Algorithms

### CKMS - Effective Computation of Biased Quantiles over Data Streams

This is an implementation of the algorithm presented in Cormode, Korn, Muthukrishnan, Srivastava's paper "Effective Computation of Biased Quantiles over Data Streams". The ambition here is to approximate quantiles on a stream of data without having a boatload of information kept in memory. This implementation follows the [IEEE version](http://ieeexplore.ieee.org/xpl/login.jsp?tp=&arnumber=1410103&url=http%3A%2F%2Fieeexplore.ieee.org%2Fxpls%2Fabs_all.jsp%3Farnumber%3D1410103) of the paper.
The authors' self-published copy of the paper is incorrect and this implementation will _not_ make sense if you follow along using that version. Only the 'full biased' invariant is used. The 'targeted quantiles' variant of this algorithm is fundamentally flawed, an issue which the authors correct in their "Space- and Time-Efficient Deterministic Algorithms for Biased Quantiles over Data Streams".

```rust
use quantiles::ckms::CKMS;

let mut ckms = CKMS::<u16>::new(0.001);
for i in 1..1001 {
    ckms.insert(i as u16);
}

assert_eq!(ckms.query(0.0), Some((1, 1)));
assert_eq!(ckms.query(0.998), Some((998, 998)));
assert_eq!(ckms.query(0.999), Some((999, 999)));
assert_eq!(ckms.query(1.0), Some((1000, 1000)));
```

Queries provide an approximation to the true quantile, +/- εΦn. In the above, ε is set to 0.001 and n is 1000. The minimum and maximum quantiles--0.0 and 1.0--are always known precisely. The error for the 0.998 query is then +/- 0.998. (This happens to be the exact quantile here, but that doesn't always hold.)

For an error ε this structure will require `T*(floor(1/(2*ε)) + O(1/ε log εn)) + f64 + usize + usize` words of storage, where T is the specialized type.

In local testing, insertion per point takes approximately 4 microseconds with a variance of 7%. This comes to 250k points per second.

### Misra Gries - ε-approximate frequency counts

Misra-Gries calculates an ε-approximate frequency count for a stream of N elements. The output is the k most frequent elements.

1. the approximate count f'[e] is smaller than the true frequency f[e] of e, but by at most εN, i.e., (f[e] - εN) ≤ f'[e] ≤ f[e]
2. any element e with a frequency f[e] ≥ εN appears in the result set

The error bound ε = 1/(k+1), where k is the number of counters used in the algorithm. When k = 1, i.e. a single counter, the algorithm is equivalent to the Boyer-Moore majority algorithm.

If you want to check for elements that appear at least εN times, you will want to perform a second pass to calculate the exact frequencies of the values in the result set, which can be done in constant space.

```rust
use quantiles::misra_gries::*;

let k: usize = 3;
let numbers: Vec<u32> = vec![1, 3, 2, 1, 3, 4, 3, 1, 2, 1];
let counts = misra_gries(numbers.iter(), k);
let bound = numbers.len() / (k + 1);
let in_range = |f_expected: usize, f_approx: usize| {
    f_approx <= f_expected && (bound >= f_expected || f_approx >= (f_expected - bound))
};
assert!(in_range(4usize, *counts.get(&1).unwrap()));
assert!(in_range(2usize, *counts.get(&2).unwrap()));
assert!(in_range(3usize, *counts.get(&3).unwrap()));
```

### Greenwald Khanna - ε-approximate quantiles

Greenwald Khanna calculates ε-approximate quantiles. If the desired quantile is `φ`, the ε-approximate quantile is any element in the range of elements that rank between `⌊(φ-ε)N⌋` and `⌊(φ+ε)N⌋`.

The stream summary data structure can cope with up to `usize::MAX` observations. The beginning and end quantiles are clamped at the minimum and maximum observed elements, respectively.
This page explains the theory: [http://www.mathcs.emory.edu/~cheung/Courses/584-StreamDB/Syllabus/08-Quantile/Greenwald.html](http://www.mathcs.emory.edu/~cheung/Courses/584-StreamDB/Syllabus/08-Quantile/Greenwald.html)

```rust
use quantiles::greenwald_khanna::*;

let epsilon = 0.01;

let mut stream = Stream::new(epsilon);

let n = 1001;
for i in 1..n {
    stream.insert(i);
}

let in_range = |phi: f64, value: u32| {
    let lower = ((phi - epsilon) * (n as f64)) as u32;
    let upper = ((phi + epsilon) * (n as f64)) as u32;
    (epsilon > phi || lower <= value) && value <= upper
};

assert!(in_range(0f64, *stream.quantile(0f64)));
assert!(in_range(0.1f64, *stream.quantile(0.1f64)));
assert!(in_range(0.2f64, *stream.quantile(0.2f64)));
assert!(in_range(0.3f64, *stream.quantile(0.3f64)));
assert!(in_range(0.4f64, *stream.quantile(0.4f64)));
assert!(in_range(1f64, *stream.quantile(1f64)));
```

## Upgrading

### 0.2 -> 0.3

This release introduces two new algorithms, "Greenwald Khanna" and "Misra Gries". The existing CKMS has been moved from the crate root into its own submodule. You'll need to update your imports from

```rust
use quantiles::CKMS;
```

to

```rust
use quantiles::ckms::CKMS;
```
quantiles-0.7.1/resources/afl_crashes_20161215.txt
quantiles-0.7.1/src/ckms/entry.rs
use std::cmp;

#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde_support", derive(Serialize, Deserialize))]
pub struct Entry<T>
where
    T: PartialEq,
{
    pub g: u32,
    pub delta: u32,
    pub v: T,
}

// The derivation of PartialEq for Entry<T> is not appropriate. The sole ordering
// value in an Entry is the value 'v'.
impl<T> PartialEq for Entry<T>
where
    T: PartialEq,
{
    fn eq(&self, other: &Entry<T>) -> bool {
        self.v == other.v
    }
}

impl<T> PartialOrd for Entry<T>
where
    T: PartialOrd,
{
    fn partial_cmp(&self, other: &Entry<T>) -> Option<cmp::Ordering> {
        self.v.partial_cmp(&other.v)
    }
}
quantiles-0.7.1/src/ckms/mod.rs
//! This is an implementation of the algorithm presented in Cormode, Korn, //! Muthukrishnan, Srivastava's paper "Effective Computation of Biased Quantiles //! over Data Streams". The ambition here is to approximate quantiles on a //! stream of data without having a boatload of information kept in memory. //! //! As of this writing you _must_ use the presentation in the IEEE version of //! the paper. The authors' self-published copy of the paper is incorrect and //! this implementation will _not_ make sense if you follow along using that //! version. Only the 'full biased' invariant is used. The 'targeted quantiles' //! variant of this algorithm is fundamentally flawed, an issue which the //! authors correct in their "Space- and Time-Efficient Deterministic Algorithms //! for Biased Quantiles over Data Streams". use std; use std::fmt::Debug; use std::ops::{Add, AddAssign, Div, Sub}; mod entry; mod store; use self::store::Store; /// A structure to provide approximate quantile queries in bounded memory and /// with bounded error. #[derive(Clone, PartialEq, Debug)] #[cfg_attr(feature = "serde_support", derive(Serialize, Deserialize))] pub struct CKMS<T> where T: Copy + PartialEq, { n: usize, // We follow the 'batch' method of the above paper. In this method, // incoming items are buffered in a priority queue, called 'buffer' here, // and once insert_threshold items are stored in the buffer it is drained // into the 'samples' collection. Insertion will cause some extraneous // points to be held that can be merged. Once compress_threshold // items are buffered the COMPRESS operation merges these extraneous points. insert_threshold: usize, inserts: usize, // This is the S(n) of the above paper. Entries are stored here and // occasionally merged. The outlined implementation uses a linked list but // we prefer a Vec for reasons of cache locality at the cost of worse // computational complexity.
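// Alongside the samples we also track `cma`, the cumulative moving average
// of every value inserted so far, and `last_in`, the most recently inserted
// value; these back the `cma()` and `last()` accessors below.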
samples: Store, cma: Option, last_in: Option, } impl AddAssign for CKMS where T: Copy + Add + Sub + Div + PartialOrd + Debug + std::convert::Into, { fn add_assign(&mut self, rhs: CKMS) { self.last_in = rhs.last_in; self.cma = match (self.cma, rhs.cma) { (None, None) => None, (None, Some(y)) => Some(y), (Some(x), None) => Some(x), (Some(x), Some(y)) => { let x_n: f64 = self.n as f64; let y_n: f64 = rhs.n as f64; Some(((x_n * x) + (y_n * y)) / (x_n + y_n)) } }; self.n += rhs.n; for inner in rhs.samples.data { for v in inner.data.iter().map(|x| x.v) { self.samples.insert(v); } } self.compress(); } } impl< T: Copy + PartialOrd + Debug + Add + Sub + Div + std::convert::Into, > CKMS { /// Create a new CKMS /// /// A CKMS is meant to answer quantile queries with a known error bound. If /// the error passed here is ε and there have been `n` items inserted into /// CKMS then for any quantile query Φ the deviance from the true quantile /// will be +/- εΦn. /// /// For an error ε this structure will require T*(floor(1/(2*ε)) + O(1/ε log /// εn)) + f64 + usize + usize words of storage. /// /// # Examples /// ``` /// use quantiles::ckms::CKMS; /// /// let mut ckms = CKMS::::new(0.001); /// for i in 1..1001 { /// ckms.insert(i as u32); /// } /// assert_eq!(ckms.query(0.0), Some((1, 1))); /// assert_eq!(ckms.query(0.998), Some((998, 998))); /// assert_eq!(ckms.query(0.999), Some((999, 999))); /// assert_eq!(ckms.query(1.0), Some((1000, 1000))); /// ``` /// /// `error` must but a value between 0 and 1, exclusive of both extremes. If /// you input an error <= 0.000_000_000_1 CKMS will assign an error of /// 0.000_000_000_1. Likewise, if your error is >= 1.0 CKMS will assign an /// error of 0.99. pub fn new(error: f64) -> CKMS { let error = if error <= 0.000_000_000_1 { 0.000_000_000_1 } else if error >= 1.0 { 0.99 } else { error }; let insert_threshold = 1.0 / (2.0 * error); let insert_threshold: usize = if insert_threshold < 1.0 { 1 } else { insert_threshold as usize }; CKMS { n: 0, insert_threshold: insert_threshold, inserts: 0, samples: Store::new(2048, error), last_in: None, cma: None, } } /// Return the last element added to the CKMS /// /// # Example /// ``` /// use quantiles::ckms::CKMS; /// /// let mut ckms = CKMS::new(0.1); /// ckms.insert(1.0); /// ckms.insert(2.0); /// ckms.insert(3.0); /// assert_eq!(Some(3.0), ckms.last()); /// ``` pub fn last(&self) -> Option { self.last_in } /// Return the cummulative moving average of the elements added to the CKMS /// /// # Example /// ``` /// use quantiles::ckms::CKMS; /// /// let mut ckms = CKMS::new(0.1); /// ckms.insert(0.0); /// ckms.insert(100.0); /// /// assert_eq!(Some(50.0), ckms.cma()); /// ``` pub fn cma(&self) -> Option { self.cma } /// Return the guaranteed error bound of this CKMS /// /// # Example /// ``` /// use quantiles::ckms::CKMS; /// /// let mut ckms: CKMS = CKMS::new(0.1); /// assert_eq!(0.1, ckms.error_bound()); /// ``` pub fn error_bound(&self) -> f64 { self.samples.error } /// Insert a T into the CKMS /// /// Insertion will gradulally shift the approximate quantiles. This /// implementation is biased toward fast writes and slower queries. Storage /// may grow gradually, as defined in the module-level documentation, but /// will remain bounded. 
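/// A short usage sketch: each call to `insert` updates the sample summary
/// along with the running count, the last-seen value and the cumulative
/// moving average reported by `count`, `last` and `cma`.
///
/// ```
/// use quantiles::ckms::CKMS;
///
/// let mut ckms = CKMS::<u32>::new(0.001);
/// ckms.insert(42);
/// assert_eq!(ckms.count(), 1);
/// assert_eq!(ckms.last(), Some(42));
/// assert_eq!(ckms.cma(), Some(42.0));
/// ```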
pub fn insert(&mut self, v: T) { self.last_in = Some(v); self.n += 1; let v_f64: f64 = v.into(); self.cma = self.cma .map_or(Some(v_f64), |s| Some(s + ((v_f64 - s) / (self.n as f64)))); self.samples.insert(v); self.inserts = (self.inserts + 1) % self.insert_threshold; if self.inserts == 0 { self.compress() } } /// Query CKMS for a ε-approximate quantile /// /// This function returns an approximation to the true quantile-- +/- εΦn /// --for the points inserted. Argument q is valid 0. <= q <= 1.0. The first /// element of the return tuple is the rank estimation for q, the second /// element is the quantile estimation for q. The minimum and maximum /// quantile, corresponding to 0.0 and 1.0 respectively, are always known /// precisely. /// /// Return /// /// # Examples /// ``` /// use quantiles::ckms::CKMS; /// /// let mut ckms = CKMS::::new(0.001); /// for i in 0..1000 { /// ckms.insert(i as u32); /// } /// /// assert_eq!(ckms.query(0.0), Some((1, 0))); /// assert_eq!(ckms.query(0.998), Some((998, 997))); /// assert_eq!(ckms.query(1.0), Some((1000, 999))); /// ``` pub fn query(&self, q: f64) -> Option<(usize, T)> { self.samples.query(q) } /// Query CKMS for the count of its points /// /// This function returns the total number of points seen over the lifetime /// of the datastructure, _not_ the number of points currently stored in the /// structure. /// /// # Examples /// ``` /// use quantiles::ckms::CKMS; /// /// let mut ckms = CKMS::::new(0.001); /// for i in 0..1000 { /// ckms.insert(i as u32); /// } /// /// assert_eq!(ckms.count(), 1000); /// ``` pub fn count(&self) -> usize { self.n } /// Retrieve a representative vector of points /// /// This function returns a represenative sample of points from the /// CKMS. Doing so consumes the CKMS. /// /// # Examples /// ``` /// use quantiles::ckms::CKMS; /// /// let mut ckms = CKMS::::new(0.1); /// for i in 0..10 { /// ckms.insert(i as u32); /// } /// /// assert_eq!(ckms.into_vec(), vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]); /// ``` pub fn into_vec(self) -> Vec { let mut res = vec![]; for inner in self.samples.data { for v in inner.data.iter().map(|x| x.v) { res.push(v); } } res } fn compress(&mut self) { self.samples.compress(); } } #[cfg(test)] mod test { use super::*; use ckms::store::invariant; use quickcheck::{QuickCheck, TestResult}; use std::f64::consts::E; fn percentile(data: &Vec, prcnt: f64) -> f64 { let idx = (prcnt * (data.len() as f64)) as usize; return data[idx]; } #[test] fn test_cma() { fn inner(data: Vec, err: f64) -> TestResult { if data.is_empty() { return TestResult::discard(); } else if !(err >= 0.0) || !(err <= 1.0) { return TestResult::discard(); } let mut ckms = CKMS::::new(err); for d in &data { ckms.insert(*d); } let sum: f64 = data.iter().sum(); let expected_mean: f64 = sum / (data.len() as f64); let mean = ckms.cma(); assert!(mean.is_some()); assert!((expected_mean - mean.unwrap()).abs() < err); return TestResult::passed(); } QuickCheck::new().quickcheck(inner as fn(Vec, f64) -> TestResult); } #[test] fn test_cma_add_assign() { fn inner(l_data: Vec, r_data: Vec, err: f64) -> TestResult { if !(err >= 0.0) || !(err <= 1.0) { return TestResult::discard(); } let mut l_ckms = CKMS::::new(err); for d in &l_data { l_ckms.insert(*d); } let mut r_ckms = CKMS::::new(err); for d in &r_data { r_ckms.insert(*d); } let sum: f64 = l_data.iter().chain(r_data.iter()).sum(); let expected_mean: f64 = sum / ((l_data.len() + r_data.len()) as f64); l_ckms += r_ckms; let mean = l_ckms.cma(); if mean.is_some() { assert!((expected_mean - 
mean.unwrap()).abs() < err); } return TestResult::passed(); } QuickCheck::new() .quickcheck(inner as fn(Vec, Vec, f64) -> TestResult); } #[test] fn error_nominal_test() { fn inner(mut data: Vec, prcnt: f64) -> TestResult { data.sort_by(|a, b| a.partial_cmp(b).unwrap()); if !(prcnt >= 0.0) || !(prcnt <= 1.0) { return TestResult::discard(); } else if data.len() < 1 { return TestResult::discard(); } let err = 0.001; let mut ckms = CKMS::::new(err); for d in &data { ckms.insert(*d); } if let Some((_, v)) = ckms.query(prcnt) { debug_assert!( (v - percentile(&data, prcnt)) < err, "v: {} | percentile: {} | prcnt: {} | data: {:?}", v, percentile(&data, prcnt), prcnt, data ); TestResult::passed() } else { TestResult::failed() } } QuickCheck::new().quickcheck(inner as fn(Vec, f64) -> TestResult); } #[test] fn error_nominal_with_merge_test() { fn inner(lhs: Vec, rhs: Vec, prcnt: f64, err: f64) -> TestResult { if !(prcnt >= 0.0) || !(prcnt <= 1.0) { return TestResult::discard(); } else if !(err >= 0.0) || !(err <= 1.0) { return TestResult::discard(); } else if (lhs.len() + rhs.len()) < 1 { return TestResult::discard(); } if lhs.is_empty() || rhs.is_empty() { return TestResult::discard(); } let mut data = lhs.clone(); data.append(&mut rhs.clone()); data.sort_by(|a, b| a.partial_cmp(b).unwrap()); let err = 0.001; let mut ckms = CKMS::::new(err); for d in &lhs { ckms.insert(*d); } let mut ckms_rhs = CKMS::::new(err); for d in &rhs { ckms_rhs.insert(*d); } ckms += ckms_rhs; if let Some((_, v)) = ckms.query(prcnt) { debug_assert!( (v - percentile(&data, prcnt)) < err, "v: {} | percentile: {} | prcnt: {} | data: {:?}", v, percentile(&data, prcnt), prcnt, data ); TestResult::passed() } else { TestResult::failed() } } QuickCheck::new() .quickcheck(inner as fn(Vec, Vec, f64, f64) -> TestResult); } #[test] fn n_invariant_test() { fn n_invariant(fs: Vec) -> bool { let l = fs.len(); let mut ckms = CKMS::::new(0.001); for f in fs { ckms.insert(f); } ckms.count() == l } QuickCheck::new().quickcheck(n_invariant as fn(Vec) -> bool); } #[test] fn count_sum_test() { fn inner(lhs: Vec, rhs: Vec) -> TestResult { let mut lhs_ckms = CKMS::::new(0.001); for f in lhs { lhs_ckms.insert(f); } let mut rhs_ckms = CKMS::::new(0.001); for f in rhs { rhs_ckms.insert(f); } let expected_count = lhs_ckms.count() + rhs_ckms.count(); lhs_ckms += rhs_ckms; assert_eq!(lhs_ckms.count(), expected_count); TestResult::passed() } QuickCheck::new().quickcheck(inner as fn(Vec, Vec) -> TestResult); } // prop: forany phi. (phi*n - f(phi*n, n)/2) =< r_i =< (phi*n + f(phi*n, n)/2) #[test] fn query_invariant_test() { fn query_invariant(f: f64, fs: Vec) -> TestResult { let error = 0.001; if fs.len() < 1 { return TestResult::discard(); } let phi = (1.0 / (1.0 + E.powf(f.abs()))) * 2.0; let mut ckms = CKMS::::new(error); for f in fs { ckms.insert(f); } match ckms.query(phi) { None => TestResult::passed(), /* invariant to check here? n*phi + * f > 1? 
*/ Some((rank, _)) => { let nphi = phi * (ckms.n as f64); let fdiv2 = (invariant(nphi, error) as f64) / 2.0; TestResult::from_bool( ((nphi - fdiv2) <= (rank as f64)) || ((rank as f64) <= (nphi + fdiv2)), ) } } } QuickCheck::new() .quickcheck(query_invariant as fn(f64, Vec) -> TestResult); } #[test] fn insert_test() { let mut ckms = CKMS::::new(0.001); for i in 0..2 { ckms.insert(i as f64); } assert_eq!(0.0, ckms.samples[0].v); assert_eq!(1.0, ckms.samples[1].v); } // prop: v_i-1 < v_i =< v_i+1 #[test] fn asc_samples_test() { fn asc_samples(fs: Vec) -> TestResult { let mut ckms = CKMS::::new(0.001); let fsc = fs.clone(); for f in fs { ckms.insert(f); } if ckms.samples.len() == 0 && fsc.len() == 0 { return TestResult::passed(); } let mut cur = ckms.samples[0].v; for ent in ckms.samples.iter() { let s = ent.v; if s < cur { return TestResult::failed(); } cur = s; } TestResult::passed() } QuickCheck::new().quickcheck(asc_samples as fn(Vec) -> TestResult); } // prop: forall i. g_i + delta_i =< f(r_i, n) #[test] fn f_invariant_test() { fn f_invariant(fs: Vec) -> TestResult { let error = 0.001; let mut ckms = CKMS::::new(error); for f in fs { ckms.insert(f); } let s = ckms.samples.len(); let mut r = 0; for i in 1..s { let ref prev = ckms.samples[i - 1]; let ref cur = ckms.samples[i]; r += prev.g; let res = (cur.g + cur.delta) <= invariant(r as f64, error); if !res { println!( "{:?} <= {:?}", cur.g + cur.delta, invariant(r as f64, error) ); println!("samples: {:?}", ckms.samples); return TestResult::failed(); } } TestResult::passed() } QuickCheck::new().quickcheck(f_invariant as fn(Vec) -> TestResult); } #[test] fn compression_test() { let mut ckms = CKMS::::new(0.1); for i in 1..10000 { ckms.insert(i); } let l = ckms.samples.len(); let n = ckms.count(); assert_eq!(9999, n); assert_eq!(320, l); } // prop: post-compression, samples is bounded above by O(1/e log^2 en) #[test] fn compression_bound_test() { fn compression_bound(fs: Vec) -> TestResult { if fs.len() < 15 { return TestResult::discard(); } let mut ckms = CKMS::::new(0.001); for f in fs { ckms.insert(f); } ckms.compress(); let s = ckms.samples.len() as i64; let bound = ((1.0 / ckms.error_bound()) * (ckms.error_bound() * (ckms.count() as f64)).log10().powi(2)) .ceil() as i64; // We have to choose an arbitrary, lowish constant for bound // invalidation buffer. This is because I don't have a precise // boundary. 1024 samples worth of slop isn't bad, I guess. 
if !(s <= bound) && !((s - bound).abs() < 1_024) { println!( "error: {:?} n: {:?} log10: {:?}", ckms.error_bound(), ckms.count() as f64, (ckms.error_bound() * (ckms.count() as f64)).log10().powi(2) ); println!("{:?} <= {:?}", s, bound); return TestResult::failed(); } TestResult::passed() } QuickCheck::new().quickcheck(compression_bound as fn(Vec) -> TestResult); } #[test] fn test_basics() { let mut ckms = CKMS::::new(0.001); for i in 1..1001 { ckms.insert(i as i32); } assert_eq!(ckms.query(0.00), Some((1, 1))); assert_eq!(ckms.query(0.05), Some((50, 50))); assert_eq!(ckms.query(0.10), Some((100, 100))); assert_eq!(ckms.query(0.15), Some((150, 150))); assert_eq!(ckms.query(0.20), Some((200, 200))); assert_eq!(ckms.query(0.25), Some((250, 250))); assert_eq!(ckms.query(0.30), Some((300, 300))); assert_eq!(ckms.query(0.35), Some((350, 350))); assert_eq!(ckms.query(0.40), Some((400, 400))); assert_eq!(ckms.query(0.45), Some((450, 450))); assert_eq!(ckms.query(0.50), Some((500, 500))); assert_eq!(ckms.query(0.55), Some((550, 550))); assert_eq!(ckms.query(0.60), Some((600, 600))); assert_eq!(ckms.query(0.65), Some((650, 650))); assert_eq!(ckms.query(0.70), Some((700, 700))); assert_eq!(ckms.query(0.75), Some((750, 750))); assert_eq!(ckms.query(0.80), Some((800, 800))); assert_eq!(ckms.query(0.85), Some((850, 850))); assert_eq!(ckms.query(0.90), Some((900, 900))); assert_eq!(ckms.query(0.95), Some((950, 950))); assert_eq!(ckms.query(0.99), Some((990, 990))); assert_eq!(ckms.query(1.00), Some((1000, 1000))); } } quantiles-0.7.1/src/ckms/store.rs010064400007660000024000000373101321360537700152050ustar0000000000000000//! store - a 'poor man's skiplist' for CKMS //! //! The CKMS requires a storage data structure that has cheap inserts and //! constant-ish loookups. That's because: //! //! * insertion implies search //! * compression implies search, shifting of data //! * query is search //! //! Prior to 0.7 CKMS used a Vec for storing its samples. This worked well //! enough for small collections of samples, but fell over once you got past //! around 50k on account of the expense of shifting data around, performing a //! search for every insert. //! //! What we've done in this module is build a "poor man's" skiplist -- //! constructed of nested Vecs of bounded sized -- which contains at each 'node' //! all the information we need to perform an insert, as if we had examined //! every sample in the block. Anyhow, we'll get into it below. use std::fmt; use std::ops::{Index, IndexMut}; use ckms::entry::Entry; /// The all-important CKMS invariant. pub fn invariant(r: f64, error: f64) -> u32 { let i = (2.0 * error * r).floor() as u32; if i == 0 { 1 } else { i } } /// Inner is the 'node' of our poor-man's skiplist. Each Inner stores a block of /// samples of bounded size -- controlled by `inner_cap` set on `Store` -- and a /// `g_sum`. The CKMS algorithm requires samples to be stored in sorted order /// and the insertion procedure builds up a `g_sum`, a sum of the ranks of each /// sample seen. To avoid re-computing this `g_sum` as we search for our /// insertion spot we instead keep a `g_sum` on Inner, meaning if we determine /// that the block will not be inserted into we can just pull `g_sum` and avoid /// inspecting every sample in the block. This implies a touch more work on /// every insertion but we come out well ahead not inspecting every sample, even /// so. 
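///
/// For example, if an Inner holds entries whose `g` values are 1, 1 and 2,
/// its `g_sum` is 4; an insertion that skips past this block adds 4 to its
/// running rank instead of inspecting all three samples.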
#[derive(Clone, PartialEq, Debug)] #[cfg_attr(feature = "serde_support", derive(Serialize, Deserialize))] pub struct Inner where T: PartialEq, { pub data: Vec>, g_sum: u32, } impl Inner where T: PartialEq + PartialOrd + Copy, { pub fn len(&self) -> usize { self.data.len() } /// split_off is patterned on the operation of `Vec::split_off`. /// /// The notion here is when an Inner goes over `Store::inner_cap` we need to /// split the samples that fall over `inner_cap` into a new Inner. This /// keeps our `inner_cap` bound going and reduces the O(n) burden of /// inserting into a Vec. /// /// The correct `g_sum` is maintained for both sides in the split. pub fn split_off(&mut self, index: usize) -> Self { assert!(index < self.data.len()); let nxt = self.data.split_off(index); let nxt_g_sum = nxt.iter().fold(0, |acc, x| acc + x.g); self.g_sum -= nxt_g_sum; Inner { data: nxt, g_sum: nxt_g_sum, } } } #[derive(Clone, PartialEq, Debug)] #[cfg_attr(feature = "serde_support", derive(Serialize, Deserialize))] pub struct Store where T: PartialEq, { /// The way CKMS works, we are allowed to respond back to a user query /// inaccurately. Just, with known inaccuracy. That's what this is, the /// known inaccuracy. What's neat is we can perform compression on the /// stored samples so long as we keep within this error bound. pub error: f64, /// Our collction of samples. See documentation of Inner for more details. pub data: Vec>, inner_cap: usize, // maximum size of an Inner len: usize, // samples currently stored n: usize, // total samples ever stored } impl Store where T: PartialEq + PartialOrd + Copy, { pub fn new(inner_cap: usize, error: f64) -> Store { assert!(inner_cap != 0); let data = Inner { data: Vec::with_capacity(inner_cap), g_sum: 0, }; Store { error: error, data: vec![data], inner_cap: inner_cap, len: 0, n: 0, } } /// Insert a point into the Store pub fn insert(&mut self, element: T) -> () where T: fmt::Debug, { // This function is a touch repetative. There are three possible // conditions when we insert a point: // // * point goes to the front // * point goes to the back // * point goes somewhere in the middle // // Insertion into the middle is the most expensive. A point inserted at // the front or back has a 'delta' -- see the referenced paper for full // details -- of 0. A point that goes into the middle has a delta // derived from the invariant, the rank of the inserted sample. That // implies a search. This store is able to skip a linear seek by // examining the max element of Inner caches, their associated maximum // rank, g_sum. 
// insert at the front if self.data[0].data.is_empty() || (self.data[0].data[0].v >= element) { self.data[0].data.insert( 0, Entry { v: element, g: 1, delta: 0, }, ); self.data[0].g_sum += 1; self.n += 1; self.len += 1; if self.data[0].len() > self.inner_cap { let nxt = self.data[0].split_off(self.inner_cap); if self.data.len() > 1 { self.data.insert(1, nxt); } else { self.data.push(nxt); } } return; } let mut outer_idx = self.data.len() - 1; let mut inner_idx = self.data[outer_idx].len() - 1; // insert at the back if self.data[outer_idx].data[inner_idx].v < element { self.data[outer_idx].data.push(Entry { v: element, g: 1, delta: 0, }); self.data[outer_idx].g_sum += 1; self.n += 1; self.len += 1; if self.data[outer_idx].len() > self.inner_cap { let nxt = self.data[outer_idx].split_off(self.inner_cap); self.data.push(nxt); } return; } // insert in the middle outer_idx = 0; inner_idx = 0; let mut r = 0; // Seek the outer_idx forward to the right cache line while outer_idx < self.data.len() { // The element for insertion is larger than the largest in the // present inner cache. In that case, we kick the outer_idx up and // capture the g_sum into our r. let mx = self.data[outer_idx].data.len(); if element > self.data[outer_idx].data[mx - 1].v { outer_idx += 1; r += self.data[outer_idx].g_sum; } else { break; } } // Seek the inner_idx forward to the right location while inner_idx < self.data[outer_idx].data.len() { // The inner cache for insertion is here at outer_cache. We now seek // inner_idx forward while the current inner_idx is < than the // element for insertion. if self.data[outer_idx].data[inner_idx].v < element { inner_idx += 1; r += 1; } else { break; } } self.data[outer_idx].data.insert( inner_idx, Entry { v: element, g: 1, delta: invariant(f64::from(r), self.error) - 1, }, ); self.data[outer_idx].g_sum += 1; if self.data[outer_idx].len() > self.inner_cap { let nxt = self.data[outer_idx].split_off(self.inner_cap); self.data.insert(outer_idx + 1, nxt); } self.n += 1; self.len += 1; } pub fn is_empty(&self) -> bool { self.len == 0 } /// Total stored samples /// /// This value will fluctuate as compression happens. pub fn len(&self) -> usize { self.len } #[cfg(test)] /// Total samples, ever /// /// This value will never decrease and may or may not be equivalent to /// `Self::len` pub fn count(&self) -> usize { self.n } pub fn compress(&mut self) { if self.len() < 3 { return; } let mut cur_outer_idx = 0; let mut cur_inner_idx = 0; let mut nxt_outer_idx = 0; let mut nxt_inner_idx = 1; let mut r: u32 = 1; while cur_outer_idx < self.data.len() { let cur_g = self.data[cur_outer_idx][cur_inner_idx].g; // If the nxt_inner_idx has gone off the rails then it's time for us // to move up to the next inner cache for the next point. if nxt_inner_idx >= self.data[nxt_outer_idx].len() { nxt_inner_idx = 0; nxt_outer_idx += 1; // When nxt_outer_idx goes off the end we've run out of samples // to compress. if nxt_outer_idx >= self.data.len() { break; } } let nxt_v = self.data[nxt_outer_idx][nxt_inner_idx].v; let nxt_g = self.data[nxt_outer_idx][nxt_inner_idx].g; let nxt_delta = self.data[nxt_outer_idx][nxt_inner_idx].delta; if cur_g + nxt_g + nxt_delta <= invariant(f64::from(r), self.error) { self.data[cur_outer_idx][cur_inner_idx].v = nxt_v; self.data[cur_outer_idx][cur_inner_idx].g += nxt_g; self.data[cur_outer_idx][cur_inner_idx].delta = nxt_delta; // If the two outer indexes don't match then we've 'moved' a g // from one inner cache to another. So, we scoot them. 
if cur_outer_idx != nxt_outer_idx { self.data[nxt_outer_idx].g_sum -= nxt_g; self.data[cur_outer_idx].g_sum += nxt_g; } self.data[nxt_outer_idx].data.remove(nxt_inner_idx); // Now that we've collapsed a point it's possible that we can // collapse the next next point into the current one as well. We // leave the indexes well enough alone as we've just removed an // item from the present inner cache. self.len -= 1; } else { // If we haven't collapsed any points we move the current // indexes to the next indexes. We also scoot up the next INNER // index, taking care to not adjust the outer index. We avoid // adjusting the outer index because it's possible we don't need // to move to a new inner cache yet. cur_outer_idx = nxt_outer_idx; cur_inner_idx = nxt_inner_idx; nxt_inner_idx += 1; } r += 1; } // It's possible after several compression passes that we'll leave tiny // inner caches in place. We don't want this. We'll move pairwise // through the inner caches and combine those that are contiguous and // fit within inner_cap. cur_outer_idx = 0; while (self.data.len() >= 1) && (cur_outer_idx < (self.data.len() - 1)) { if self.data[cur_outer_idx].data.len() + self.data[cur_outer_idx + 1].data.len() <= self.inner_cap { let mut nxt = self.data.remove(cur_outer_idx + 1); let cur = &mut self.data[cur_outer_idx]; cur.g_sum += nxt.g_sum; cur.data.append(&mut nxt.data); } else { cur_outer_idx += 1; } } } pub fn query(&self, q: f64) -> Option<(usize, T)> { if self.is_empty() { return None; } let mut r: u32 = 0; let s = self.len(); let nphi = q * (self.n as f64); for i in 1..s { // TODO indexing is no longer constant, make sure we don't do two // seeking indexes let prev = &self[i - 1]; let cur = &self[i]; r += prev.g; let lhs = f64::from(r + cur.g + cur.delta); let inv = invariant(nphi, self.error); let rhs = nphi + (f64::from(inv) / 2.0); if lhs > rhs { return Some((r as usize, prev.v)); } } let v = self[s - 1].v; Some((s, v)) } #[cfg(test)] pub fn iter(&self) -> StoreIter { StoreIter { store: &self.data, outer_idx: 0, inner_idx: 0, } } } impl IndexMut for Inner where T: PartialEq, { fn index_mut(&mut self, index: usize) -> &mut Entry { &mut self.data[index] } } impl Index for Inner where T: PartialEq, { type Output = Entry; fn index(&self, index: usize) -> &Self::Output { &self.data[index] } } impl IndexMut for Store where T: PartialEq + PartialOrd + Copy, { fn index_mut(&mut self, index: usize) -> &mut Entry { let mut outer_idx = 0; let mut idx = index; while idx >= self.data[outer_idx].len() { idx -= self.data[outer_idx].len(); outer_idx += 1; } &mut self.data[outer_idx][idx] } } impl Index for Store where T: PartialEq + PartialOrd + Copy, { type Output = Entry; fn index(&self, index: usize) -> &Self::Output { let mut outer_idx = 0; let mut idx = index; while idx >= self.data[outer_idx].len() { idx -= self.data[outer_idx].len(); outer_idx += 1; } &self.data[outer_idx][idx] } } #[cfg(test)] pub struct StoreIter<'a, T> where T: 'a + PartialEq, { store: &'a Vec>, outer_idx: usize, inner_idx: usize, } #[cfg(test)] impl<'a, T> Iterator for StoreIter<'a, T> where T: PartialEq + Copy + PartialOrd + fmt::Debug, { type Item = &'a Entry; fn next(&mut self) -> Option { while self.outer_idx < self.store.len() { if self.inner_idx < self.store[self.outer_idx].len() { let ret = &self.store[self.outer_idx][self.inner_idx]; self.inner_idx += 1; return Some(ret); } self.inner_idx = 0; self.outer_idx += 1; } None } } #[cfg(test)] mod test { use super::*; use quickcheck::{QuickCheck, TestResult}; #[test] fn 
inner_caches_test() { let mut store = Store::::new(10, 0.99); for i in 0..100 { store.insert(i); } assert_eq!(10, store.data.len()); } #[test] fn compression_test() { let mut store = Store::::new(100, 0.1); for i in 0..10_000 { store.insert(i); } store.compress(); assert_eq!(10_000, store.count()); assert_eq!(42, store.len()); } #[test] fn obey_inner_cap() { fn inner(data: Vec, inner_cap: usize, err: f64) -> TestResult { if data.is_empty() { return TestResult::discard(); } else if inner_cap == 0 { return TestResult::discard(); } else if !(err >= 0.0) || !(err <= 1.0) { return TestResult::discard(); } let mut store = Store::::new(inner_cap, err); for d in &data { store.insert(*d); } for inner in store.data { assert!(inner.len() <= store.inner_cap); } return TestResult::passed(); } QuickCheck::new().quickcheck(inner as fn(Vec, usize, f64) -> TestResult); } } quantiles-0.7.1/src/greenwald_khanna.rs010064400007660000024000000243641316256632300164110ustar0000000000000000//! Greenwald Khanna calculates epsilon-approximate quantiles. //! If the desired quantile is phi, the epsilon-approximate //! quantile is any element in the range of elements that rank //! between `lbound((phi-epsilon) x N)` and `lbound((phi+epsilon) x N)` //! //! terminology from the paper: //! //! * S: set of observations //! * n: number of observations in S //! * v[i]: observation i in S //! * r: rank of observation in S from 1 to n. //! * `r_min(v[i])`: lower bound on rank r of v[i] //! * `r_max(v[i])`: upper bound on rank r of v[i] //! * `g[i] = r_min(v[i]) - r_min(v[i - 1])` //! * `delta[i] = r_max(v[i]) - r_min(v[i])` //! * `t[i] = tuple(v[i], g[i], delta[i])` //! * phi: quantile as a real number in the range [0,1] //! * r: ubound(phi * n) //! //! identities: //! //! * `r_min(v[i]) = forall j<=i sum of g[j]` //! * `r_max(v[i]) = ( forall j<=i sum of g[j] ) + delta[i]` //! * g[i] + delta[i] - 1 is an upper bound on the total number of observations //! * between v[i] and v[i-1] //! * sum of g[i] = n //! //! results: //! //! * `max_i(g[i] + delta[i]) <= 2 * epsilon * n` //! * a tuple is full if g[i] + delta[i] = floor(2 * epsilon * n) //! //! `@inproceedings{Greenwald:2001:SOC:375663.375670, //! author = {Greenwald, Michael and Khanna, Sanjeev}, //! title = {Space-efficient Online Computation of Quantile Summaries}, //! booktitle = {Proceedings of the 2001 ACM SIGMOD International //! Conference //! on Management of Data}, //! series = {SIGMOD '01}, //! year = {2001}, //! isbn = {1-58113-332-4}, //! location = {Santa Barbara, California, USA}, //! pages = {58--66}, //! numpages = {9}, //! url = {http://doi.acm.org/10.1145/375663.375670}, //! doi = {10.1145/375663.375670}, //! acmid = {375670}, //! publisher = {ACM}, //! address = {New York, NY, USA}, //! }` //! //! # Examples //! //! ``` //! use quantiles::greenwald_khanna::*; //! //! let epsilon = 0.01; //! //! let mut stream = Stream::new(epsilon); //! //! let n = 1001; //! for i in 1..n { //! stream.insert(i); //! } //! let in_range = |phi: f64, value: u32| { //! let lower = ((phi - epsilon) * (n as f64)) as u32; //! let upper = ((phi + epsilon) * (n as f64)) as u32; //! (epsilon > phi || lower <= value) && value <= upper //! }; //! assert!(in_range(0f64, *stream.quantile(0f64))); //! assert!(in_range(0.1f64, *stream.quantile(0.1f64))); //! assert!(in_range(0.2f64, *stream.quantile(0.2f64))); //! assert!(in_range(0.3f64, *stream.quantile(0.3f64))); //! assert!(in_range(0.4f64, *stream.quantile(0.4f64))); //! assert!(in_range(1f64, *stream.quantile(1f64))); //! 
``` use std::cmp; /// Locates the proper position of v in a vector vs /// such that when v is inserted at position i, /// it is less than the element at i+1 if any, /// and greater than or equal to the element at i-1 if any. pub fn find_insert_pos<T>(vs: &[T], v: &T) -> usize where T: Ord, { if vs.len() <= 10 { return find_insert_pos_linear(vs, v); } let middle = vs.len() / 2; let pivot = &vs[middle]; if v < pivot { find_insert_pos(&vs[0..middle], v) } else { middle + find_insert_pos(&vs[middle..], v) } } /// Locates the proper position of v in a vector vs /// such that when v is inserted at position i, /// it is less than the element at i+1 if any, /// and greater than or equal to the element at i-1 if any. /// Works by scanning the slice from start to end. pub fn find_insert_pos_linear<T>(vs: &[T], v: &T) -> usize where T: Ord, { for (i, vi) in vs.iter().enumerate() { if v < vi { return i; } } vs.len() } /// 3-tuple of a value v[i], g[i] and delta[i]. #[derive(Eq, Ord, Debug)] pub struct Tuple<T> where T: Ord, { /// v[i], an observation in the set of observations pub v: T, /// the difference between the rank lower bounds of t[i] and t[i-1] /// g = r_min(v[i]) - r_min(v[i - 1]) pub g: usize, /// the difference between the rank upper and lower bounds for this tuple pub delta: usize, } impl<T> Tuple<T> where T: Ord, { /// Creates a new instance of a Tuple pub fn new(v: T, g: usize, delta: usize) -> Tuple<T> { Tuple { v: v, g: g, delta: delta, } } } impl<T> PartialEq for Tuple<T> where T: Ord, { fn eq(&self, other: &Self) -> bool { self.v == other.v } } impl<T> PartialOrd for Tuple<T> where T: Ord, { fn partial_cmp(&self, other: &Self) -> Option<cmp::Ordering> { self.v.partial_cmp(&other.v) } } /// The summary S of the observations seen so far. #[derive(Debug)] pub struct Stream<T> where T: Ord, { /// An ordered sequence of the selected observations summary: Vec<Tuple<T>>, /// The error factor epsilon: f64, /// The number of observations n: usize, } impl<T> Stream<T> where T: Ord, { /// Creates a new instance of a Stream pub fn new(epsilon: f64) -> Stream<T> { Stream { summary: vec![], epsilon: epsilon, n: 0, } } /// Locates the correct position in the summary data set /// for the observation v, and inserts a new tuple (v,1,floor(2en)). /// If v is the new minimum or maximum, then instead insert /// tuple (v,1,0). pub fn insert(&mut self, v: T) { let mut t = Tuple::new(v, 1, 0); let pos = find_insert_pos(&self.summary, &t); if pos != 0 && pos != self.summary.len() { t.delta = (2f64 * self.epsilon * (self.n as f64).floor()) as usize; } self.summary.insert(pos, t); self.n += 1; if self.should_compress() { self.compress(); } } /// Compute the epsilon-approximate phi-quantile /// from the summary data structure.
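///
/// A short usage sketch (added for illustration; by the epsilon-approximate
/// guarantee the returned value may differ from the exact quantile by up to
/// epsilon * n ranks):
///
/// ```
/// use quantiles::greenwald_khanna::Stream;
///
/// let mut stream = Stream::new(0.01);
/// for i in 1..1001u32 {
///     stream.insert(i);
/// }
/// // the exact 0.5-quantile of 1..=1000 has rank 500, and epsilon * n = 10
/// let approx_median = *stream.quantile(0.5);
/// assert!(approx_median >= 490 && approx_median <= 510);
/// ```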
pub fn quantile(&self, phi: f64) -> &T { assert!(self.summary.len() >= 1); assert!(phi >= 0f64 && phi <= 1f64); let r = (phi * self.n as f64).floor() as usize; let en = (self.epsilon * self.n as f64) as usize; let first = &self.summary[0]; let mut prev = &first.v; let mut prev_rmin = first.g; for t in self.summary.iter().skip(1) { let rmax = prev_rmin + t.g + t.delta; if rmax > r + en { return prev; } prev_rmin += t.g; prev = &t.v; } prev } fn should_compress(&self) -> bool { let period = (1f64 / (2f64 * self.epsilon)).floor() as usize; self.n % period == 0 } fn compress(&mut self) { let s = self.s(); for i in (1..(s - 1)).rev() { if self.can_delete(i) { self.delete(i); } } } fn can_delete(&self, i: usize) -> bool { assert!(self.summary.len() >= 2); assert!(i < self.summary.len() - 1); let t = &self.summary[i]; let tnext = &self.summary[i + 1]; let p = self.p(); let safety_property = t.g + tnext.g + tnext.delta < p; let optimal = Self::band(t.delta, p) <= Self::band(tnext.delta, p); safety_property && optimal } /// Remove the ith tuple from the summary. /// Panics if i is not in the range [0,summary.len() - 1) /// Only permitted if g[i] + g[i+1] + delta[i+1] < 2 * epsilon * n fn delete(&mut self, i: usize) { assert!(self.summary.len() >= 2); assert!(i < self.summary.len() - 1); let t = self.summary.remove(i); let tnext = &mut self.summary[i]; tnext.g += t.g; } /// Compute which band a delta lies in. fn band(delta: usize, p: usize) -> usize { assert!(p >= delta); let diff = p - delta + 1; (diff as f64).log(2f64).floor() as usize } /// Calculate p = 2epsilon * n pub fn p(&self) -> usize { (2f64 * self.epsilon * (self.n as f64)).floor() as usize } /// The number of observations inserted into the stream. pub fn n(&self) -> usize { self.n } /// Indication of the space usage of the summary data structure /// Returns the number of tuples in the summary /// data structure. pub fn s(&self) -> usize { self.summary.len() } } #[cfg(test)] mod test { use super::*; use std::ops::Range; #[test] fn test_find_insert_pos() { let mut vs = vec![]; for v in 0..10 { vs.push(v); } for v in 0..10 { assert_eq!(find_insert_pos_linear(&vs, &v), v + 1); } } fn get_quantile_for_range(r: &Range, phi: f64) -> u32 { (phi * ((r.end - 1) - r.start) as f64).floor() as u32 + r.start } fn get_quantile_bounds_for_range(r: Range, phi: f64, epsilon: f64) -> (u32, u32) { let lower = get_quantile_for_range(&r, (phi - epsilon).max(0f64)); let upper = get_quantile_for_range(&r, phi + epsilon); (lower, upper) } fn quantile_in_bounds(r: Range, s: &Stream, phi: f64, epsilon: f64) -> bool { let approx_quantile = *s.quantile(phi); let (lower, upper) = get_quantile_bounds_for_range(r, phi, epsilon); // println!("approx_quantile={} lower={} upper={} phi={} epsilon={}", // approx_quantile, lower, upper, phi, epsilon); approx_quantile >= lower && approx_quantile <= upper } #[test] fn test_basics() { let epsilon = 0.01; let mut stream = Stream::new(epsilon); for i in 1..1001 { stream.insert(i); } for phi in 0..100 { assert!(quantile_in_bounds( 1..1001, &stream, (phi as f64) / 100f64, epsilon )); } } quickcheck! { fn find_insert_pos_log_equals_find_insert_pos_linear(vs: Vec) -> bool { let mut vs = vs; vs.sort(); for v in -100..100 { if find_insert_pos(&vs, &v) != find_insert_pos_linear(&vs, &v) { return false; } } true } fn test_gk(vs: Vec) -> bool { let mut s = Stream::new(0.25); for v in vs { s.insert(v); } true } } } quantiles-0.7.1/src/histogram.rs010064400007660000024000000567501316256646500151310ustar0000000000000000//! 
'histogram' approximates a distribution calculation by counting the number //! of times samples fall into pre-configured bins. This implementation does not //! require bins to be equally sized. The user must specify upper bounds on bins //! via `Bounds`. The implementation includes a +Inf bound automatically. //! //! Storage cost is proportional to the number of bins. The implementation is //! biased in favor of writes. use std::cmp; use std::fmt; use std::ops; use std::slice; #[derive(Debug, Copy, Clone)] #[cfg_attr(feature = "serde_support", derive(Serialize, Deserialize))] /// The upper bound for each `Histogram` bins. The user is responsible for /// determining effective bins for their use-case. pub enum Bound where T: Copy, { /// A finite upper bound. Finite(T), /// A positively infinite upper bound. We cheat when doing ordering and say /// that PosInf == PosInf. This is not strictly true but it's true enough /// for us. PosInf, } impl PartialOrd for Bound where T: Copy + cmp::PartialOrd, { fn partial_cmp(&self, other: &Bound) -> Option { match *self { Bound::Finite(ref x) => match *other { Bound::Finite(y) => x.partial_cmp(&y), Bound::PosInf => Some(cmp::Ordering::Less), }, Bound::PosInf => match *other { Bound::Finite(_) => Some(cmp::Ordering::Greater), Bound::PosInf => Some(cmp::Ordering::Equal), }, } } } impl PartialEq for Bound where T: Copy + cmp::PartialEq, { fn eq(&self, other: &Bound) -> bool { match *self { Bound::Finite(ref x) => match *other { Bound::Finite(y) => y.eq(x), Bound::PosInf => false, }, Bound::PosInf => match *other { Bound::Finite(_) => false, Bound::PosInf => true, }, } } } impl ops::AddAssign for Histogram where T: Copy + cmp::PartialOrd + fmt::Debug + ops::Add, { fn add_assign(&mut self, rhs: Histogram) { let lhs_sum = self.sum; let rhs_sum = rhs.sum; let sum = match (lhs_sum, rhs_sum) { (None, None) => None, (None, Some(y)) => Some(y), (Some(x), None) => Some(x), (Some(x), Some(y)) => Some(x + y), }; self.sum = sum; self.count += rhs.count; for (i, bnd) in rhs.iter().enumerate() { assert_eq!(self.bins[i].0, bnd.0); self.bins[i].1 += bnd.1; } } } /// A binning histogram of unequal, pre-defined bins /// /// This implementation performs summation over `T`. It's possible that this /// summation will overflow, a crash condition in Rust. Unfortunately there's no /// generic saturating / checked add over a generic. Please take care when /// inserting into Histogram for small `T`s. #[derive(Debug, Clone, PartialEq)] #[cfg_attr(feature = "serde_support", derive(Serialize, Deserialize))] pub struct Histogram where T: Copy, { count: usize, sum: Option, bins: Vec<(Bound, usize)>, } /// Struct to implement Iterator over Histogram #[derive(Debug)] pub struct Iter<'a, T> where T: 'a + Copy, { rx: slice::Iter<'a, (Bound, usize)>, } impl<'a, T> Iterator for Iter<'a, T> where T: Copy, { type Item = &'a (Bound, usize); fn next(&mut self) -> Option { self.rx.next() } } #[derive(Debug, Copy, Clone)] /// Construction errors /// /// `Histogram` is a little finicky when you construct it. We signal errors out /// to the user with this enumeration. pub enum Error { /// The bounds given to Histogram are empty. We need bounds. BoundsEmpty, /// The bounds given to Histogram are not sorted. They must be. 
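///
/// A minimal illustration (added here; `u32` is an arbitrary element type):
///
/// ```
/// use quantiles::histogram::Histogram;
///
/// // bounds must already be sorted in ascending order; this is rejected
/// assert!(Histogram::<u32>::new(vec![100, 10]).is_err());
/// ```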
BoundsNotSorted, } fn is_sorted(bounds: &[T]) -> bool where T: cmp::PartialOrd + fmt::Debug, { let mut prev = None; for i in bounds { if prev.is_none() { prev = Some(i); continue; } let p: &T = prev.unwrap(); match i.partial_cmp(p) { Some(cmp::Ordering::Less) => { return false; } _ => { prev = Some(i); } } } true } impl Histogram where T: Copy + cmp::PartialOrd + fmt::Debug, { /// Create a new Histogram /// /// This Histogram is a binning histogram of unequal bins. The user is /// responsible for defining the upper bounds of bins. Users are able to /// query bin counts without exact bins but should be aware that the results /// will only be approximate unless the explicit bin is used. See `total_*` /// functions for details. /// /// # Examples /// ``` /// use quantiles::histogram::{Bound, Histogram}; /// /// let mut histo = Histogram::::new(vec![10, 256, 1987, /// 1990]).unwrap(); /// for i in 0..2048 { /// histo.insert(i as u64); /// } /// /// assert_eq!(histo.total_above(Bound::Finite(0)), 2048); /// assert_eq!(histo.total_above(Bound::Finite(11)), 2037); /// assert_eq!(histo.total_above(Bound::Finite(10)), 2037); /// assert_eq!(histo.total_between(Bound::Finite(1987), /// Bound::Finite(1990)), 3); /// assert_eq!(histo.total_below(Bound::PosInf), 2048); /// ``` pub fn new(bounds: Vec) -> Result, Error> { if bounds.is_empty() { return Err(Error::BoundsEmpty); } if !is_sorted(&bounds) { return Err(Error::BoundsNotSorted); } let mut bins: Vec<(Bound, usize)> = bounds .into_iter() .map(|x| (Bound::Finite(x), usize::min_value())) .collect(); let cap: (Bound, usize) = (Bound::PosInf, 0); bins.push(cap); Ok(Histogram { count: 0, sum: None, bins: bins, }) } /// Insert a T into the Histogram /// /// Insertion will search for the appropriate bin and increase the counter /// found there. If two bins `a` and `b` form a bin with `a < b` then `X` /// will be placed into that bin if `a < X <= b`. 
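/// Values larger than every user-supplied bound are counted in the implicit
/// `Bound::PosInf` bin, so no insertion is ever silently dropped.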
/// /// # Examples /// ``` /// use quantiles::histogram::{Bound, Histogram}; /// /// let mut histo = Histogram::::new(vec![10, 100]).unwrap(); /// histo.insert(99 as u64); /// histo.insert(100 as u64); /// /// assert_eq!(histo.total_between(Bound::Finite(10), Bound::Finite(100)), /// 2); /// ``` pub fn insert(&mut self, value: T) -> () where T: ops::Add, { self.sum = match self.sum { None => Some(value), Some(x) => Some(x + value), }; let mut idx = 0; let val_bound = Bound::Finite(value); for &(ref bound, _) in &self.bins { match bound.partial_cmp(&val_bound) { Some(cmp::Ordering::Greater) | Some(cmp::Ordering::Equal) => { break; } Some(cmp::Ordering::Less) | None => idx += 1, } } self.bins[idx].1 += 1; self.count += 1; } /// Returns the total number of items 'stored' in the histogram /// /// # Examples /// ``` /// use quantiles::histogram::{Bound, Histogram}; /// /// let mut histo = Histogram::::new(vec![10, 256, 1987, /// 1990]).unwrap(); /// for i in 0..2048 { /// histo.insert(i as u64); /// } /// /// assert_eq!(histo.count(), 2048); /// ``` pub fn count(&self) -> usize { self.count } /// Returns the sum of the items 'stored' in the histogram /// /// # Examples /// ``` /// use quantiles::histogram::Histogram; /// /// let mut histo = Histogram::::new(vec![10, 256, 1987, /// 1990]).unwrap(); /// /// assert_eq!(histo.sum(), None); /// /// for i in 0..2048 { /// histo.insert(i as u64); /// } /// /// assert_eq!(histo.sum(), Some(2096128)); /// ``` pub fn sum(&self) -> Option { self.sum } /// Total number of items below supplied upper_bound /// /// # Examples /// ``` /// use quantiles::histogram::{Bound, Histogram}; /// /// let mut histo = Histogram::::new(vec![10, 256, 1987, /// 1990]).unwrap(); /// for i in 0..2048 { /// histo.insert(i as u64); /// } /// /// assert_eq!(histo.total_below(Bound::PosInf), 2048); /// ``` pub fn total_below(&self, upper: Bound) -> usize { let mut count = 0; for &(ref bound, cnt) in &self.bins { if bound > &upper { break; } else { count += cnt; } } count } /// Total number of items above supplied lower_bound /// /// # Examples /// ``` /// use quantiles::histogram::{Bound, Histogram}; /// /// let mut histo = Histogram::::new(vec![10, 256, 1987, /// 1990]).unwrap(); /// for i in 0..2048 { /// histo.insert(i as u64); /// } /// /// assert_eq!(histo.total_above(Bound::Finite(0)), 2048); /// assert_eq!(histo.total_above(Bound::Finite(11)), 2037); /// assert_eq!(histo.total_above(Bound::Finite(10)), 2037); /// ``` pub fn total_above(&self, lower: Bound) -> usize { let mut count = 0; for &(ref bound, cnt) in &self.bins { if bound <= &lower { continue; } count += cnt; } count } /// Total number of items between [lower_bound, upper_bound) /// /// # Examples /// ``` /// use quantiles::histogram::{Bound, Histogram}; /// /// let mut histo = Histogram::::new(vec![10, 256, 1987, /// 1990]).unwrap(); /// for i in 0..2048 { /// histo.insert(i as u64); /// } /// /// assert_eq!(histo.total_between(Bound::Finite(1987), /// Bound::Finite(1990)), 3); /// ``` pub fn total_between(&self, lower: Bound, upper: Bound) -> usize { if lower >= upper { return 0; } let mut count = 0; for &(ref bound, cnt) in &self.bins { if bound > &lower && bound <= &upper { count += cnt; } } count } /// Iterate over the bounds and counts of bounds /// # Examples /// ``` /// use quantiles::histogram::{Bound, Histogram}; /// /// let mut histo = Histogram::::new(vec![10, 256, 1987, /// 1990]).unwrap(); /// for i in 0..2048 { /// histo.insert(i as u64); /// } /// /// let expected: Vec<(Bound, usize)> = 
vec![(Bound::Finite(10), 11), /// (Bound::Finite(256), 246), (Bound::Finite(1987), 1731), /// (Bound::Finite(1990), 3), (Bound::PosInf, 57)]; /// let actual: Vec<(Bound, usize)> = histo.iter().map(|x| /// *x).collect(); /// assert_eq!(expected[0], actual[0]); /// assert_eq!(expected[1], actual[1]); /// assert_eq!(expected[2], actual[2]); /// assert_eq!(expected[3], actual[3]); /// assert_eq!(expected[4], actual[4]); /// ``` pub fn iter(&self) -> Iter { Iter { rx: self.bins.iter(), } } /// Convert a Histogram into an array of tuples /// /// # Examples /// ``` /// use quantiles::histogram::{Bound, Histogram}; /// /// let mut histo = Histogram::::new(vec![10, 256, 1987, /// 1990]).unwrap(); /// for i in 0..2048 { /// histo.insert(i as u64); /// } /// /// let expected: Vec<(Bound, usize)> = vec![(Bound::Finite(10), 11), /// (Bound::Finite(256), 246), (Bound::Finite(1987), 1731), /// (Bound::Finite(1990), 3), (Bound::PosInf, 57)]; /// let actual: Vec<(Bound, usize)> = histo.into_vec(); /// assert_eq!(expected[0], actual[0]); /// assert_eq!(expected[1], actual[1]); /// assert_eq!(expected[2], actual[2]); /// assert_eq!(expected[3], actual[3]); /// assert_eq!(expected[4], actual[4]); /// ``` pub fn into_vec(self) -> Vec<(Bound, usize)> { self.iter().cloned().collect() } } #[cfg(test)] mod test { use super::*; use quickcheck::{QuickCheck, TestResult}; #[test] fn test_addassign() { fn inner(mut bounds: Vec, lpyld: Vec, rpyld: Vec) -> TestResult { if bounds.is_empty() { return TestResult::discard(); } bounds.sort_by(|a, b| a.partial_cmp(b).unwrap()); let mut x = Histogram::new(bounds.clone()).unwrap(); for i in lpyld { x.insert(i); } let mut y = Histogram::new(bounds).unwrap(); for i in rpyld { y.insert(i); } let mut res = x.clone(); res += y.clone(); assert_eq!(res.count(), x.count() + y.count()); if res.sum().is_some() { match (x.sum().is_some(), y.sum().is_some()) { (true, true) => { assert_eq!(res.sum().unwrap(), x.sum().unwrap() + y.sum().unwrap()); } (false, true) => { assert_eq!(res.sum().unwrap(), y.sum().unwrap()); } (true, false) => { assert_eq!(res.sum().unwrap(), x.sum().unwrap()); } (false, false) => unreachable!(), } } else { assert!(x.sum().is_none()); assert!(y.sum().is_none()); } let mut x_iter = x.iter(); let mut y_iter = y.iter(); for &(bound, count) in res.iter() { let next_x = x_iter.next().unwrap(); let next_y = y_iter.next().unwrap(); assert_eq!(bound, next_x.0); assert_eq!(bound, next_y.0); assert_eq!(count, next_x.1 + next_y.1) } TestResult::passed() } QuickCheck::new().quickcheck(inner as fn(Vec, Vec, Vec) -> TestResult); } macro_rules! 
generate_tests { ($m:ident, $t:ty) => { mod $m { use super::*; #[test] fn test_is_sorted() { fn inner(mut pyld: Vec<$t>) -> TestResult { pyld.sort_by(|a, b| a.partial_cmp(b).unwrap()); assert!(is_sorted(&pyld)); TestResult::passed() } QuickCheck::new().quickcheck(inner as fn(Vec<$t>) -> TestResult); } #[test] fn test_insertion_count() { fn inner(mut bounds: Vec<$t>, pyld: Vec<$t>) -> TestResult { if bounds.is_empty() { return TestResult::discard(); } bounds.sort_by(|a, b| a.partial_cmp(b).unwrap()); let mut histo = Histogram::new(bounds).unwrap(); let total = pyld.len(); for i in pyld.clone() { histo.insert(i); } // confirm that the histogram holds the correct number of items assert_eq!(total, histo.count()); TestResult::passed() } QuickCheck::new().quickcheck(inner as fn(Vec<$t>, Vec<$t>) -> TestResult); } #[test] fn test_insertion_sum() { fn inner(mut bounds: Vec<$t>, pyld: Vec<$t>) -> TestResult { if bounds.is_empty() { return TestResult::discard(); } bounds.sort_by(|a, b| a.partial_cmp(b).unwrap()); let mut histo = Histogram::new(bounds).unwrap(); let mut sum: $t = 0 as $t; for i in pyld.clone() { sum += i; histo.insert(i); } // confirm that the histogram holds the correct sum of items if pyld.is_empty() { assert_eq!(None, histo.sum()); } else { assert_eq!(Some(sum), histo.sum()); } TestResult::passed() } QuickCheck::new().quickcheck(inner as fn(Vec<$t>, Vec<$t>) -> TestResult); } #[test] fn test_insertion_below_count() { fn inner(mut bounds: Vec<$t>, mut pyld: Vec<$t>) -> TestResult { if bounds.is_empty() { return TestResult::discard(); } bounds.sort_by(|a, b| a.partial_cmp(b).unwrap()); let mut histo = Histogram::new(bounds.clone()).unwrap(); for i in pyld.clone() { histo.insert(i); } let mut bounds: Vec> = bounds.into_iter().map(|x| Bound::Finite(x)).collect(); bounds.push(Bound::PosInf); // confirm that the histogram has correctly binned by // asserting that for every bound the correct number of // payload items are below that upper bound pyld.sort_by(|a, b| a.partial_cmp(b).unwrap()); for b in bounds.iter() { let mut below_count = 0; for v in pyld.iter() { match b { &Bound::Finite(ref bnd) => { if v <= bnd { below_count += 1; } else { break; } } &Bound::PosInf => { below_count += 1; } } } assert_eq!(below_count, histo.total_below(*b)) } TestResult::passed() } QuickCheck::new().quickcheck(inner as fn(Vec<$t>, Vec<$t>) -> TestResult); } #[test] fn test_insertion_above_count() { fn inner(mut bounds: Vec<$t>, mut pyld: Vec<$t>) -> TestResult { if bounds.is_empty() { return TestResult::discard(); } bounds.sort_by(|a, b| a.partial_cmp(b).unwrap()); let mut histo = Histogram::new(bounds.clone()).unwrap(); for i in pyld.clone() { histo.insert(i); } let mut bounds: Vec> = bounds.into_iter().map(|x| Bound::Finite(x)).collect(); bounds.push(Bound::PosInf); // confirm that the histogram has correctly binned by // asserting that for every bound the correct number of // payload items are above that upper bound pyld.sort_by(|a, b| a.partial_cmp(b).unwrap()); for b in bounds.iter() { let mut above_count = 0; for v in pyld.iter() { match b { &Bound::Finite(ref bnd) => { if v > bnd { above_count += 1; } } &Bound::PosInf => {} } } assert_eq!(above_count, histo.total_above(*b)) } TestResult::passed() } QuickCheck::new().quickcheck(inner as fn(Vec<$t>, Vec<$t>) -> TestResult); } #[test] fn test_insertion_between_count() { fn inner(mut bounds: Vec<$t>, mut pyld: Vec<$t>) -> TestResult { if bounds.is_empty() { return TestResult::discard(); } bounds.sort_by(|a, b| a.partial_cmp(b).unwrap()); let mut 
histo = Histogram::new(bounds.clone()).unwrap(); for i in pyld.clone() { histo.insert(i); } let mut bounds: Vec> = bounds.into_iter().map(|x| Bound::Finite(x)).collect(); bounds.push(Bound::PosInf); // confirm that the histogram has correctly binned by // asserting that for every (lower, upper] bound the // correct number of payload items are recorded between // that bound pyld.sort_by(|a, b| a.partial_cmp(b).unwrap()); for lower_b in bounds.iter() { for upper_b in bounds.iter() { let mut between_count = 0; if lower_b < upper_b { for v in pyld.iter() { match (lower_b, upper_b) { (&Bound::Finite(ref lw_b), &Bound::Finite(ref up_b)) => { if v > lw_b && v <= up_b { between_count += 1; } } (&Bound::Finite(ref lw_b), &Bound::PosInf) => { if v > lw_b { between_count += 1; } } _ => {} } } } assert_eq!(between_count, histo.total_between(*lower_b, *upper_b)) } } TestResult::passed() } QuickCheck::new().quickcheck(inner as fn(Vec<$t>, Vec<$t>) -> TestResult); } } } } // Why no generation for u8? Please see note on Histogram. generate_tests!(u16, u16); generate_tests!(u32, u32); generate_tests!(i16, i16); generate_tests!(i32, i32); generate_tests!(f32, f32); generate_tests!(f64, f64); generate_tests!(u64, u64); generate_tests!(i64, i64); generate_tests!(usize, usize); generate_tests!(isize, isize); } quantiles-0.7.1/src/lib.rs010064400007660000024000000015021321256076300136520ustar0000000000000000//! This crate provides approximate quantiles over data streams in a moderate //! amount of memory. //! //! Order statistics is a rough business. Exact solutions are expensive in terms //! of memory and computation. Recent literature has advanced approximations but //! each have fundamental tradeoffs. This crate is intended to be a collection //! of approximate algorithms that provide guarantees around space consumption. #![deny(missing_docs, missing_debug_implementations, missing_copy_implementations, unsafe_code, unstable_features, unused_import_braces)] #[cfg(test)] #[macro_use] extern crate quickcheck; #[cfg(feature = "serde_support")] #[macro_use] extern crate serde_derive; #[cfg(feature = "serde_support")] extern crate serde; pub mod misra_gries; pub mod greenwald_khanna; pub mod ckms; pub mod histogram; quantiles-0.7.1/src/misra_gries.rs010064400007660000024000000106261316256632500154230ustar0000000000000000//! Misra-Gries calculates an ε-approximate frequency count for a stream of N //! elements. The output is the k most frequent elements. //! //! 1. the approximate count f'[e] is smaller than the true frequency f[e] of e, //! but by at most εN, i.e., (f[e] - εN) ≤ f'[e] ≤ f[e] //! 2. any element e with a frequency f[e] ≥ εN appears in the result set //! //! The error bound ε = 1/(k+1) where k is the number of counters used in the //! algorithm. //! When k = 1 i.e. a single counter, the algorithm is equivalent to the //! Boyer-Moore Majority algorithm. //! //! If you want to check for elements that appear at least εN times, you will //! want to perform a second pass to calculate the exact frequencies of the //! values in the result set which can be done in constant space. //! //! `@article{MISRA1982143, //! title = "Finding repeated elements", //! journal = "Science of Computer Programming", //! volume = "2", //! number = "2", //! pages = "143 - 152", //! year = "1982", //! issn = "0167-6423", //! doi = "http://dx.doi.org/10.1016/0167-6423(82)90012-0", //! url = "http://www.sciencedirect.com/science/article/pii/0167642382900120", //! author = "J. Misra and David Gries", //! }` //! //! # Examples //! 
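//! The first sketch below is an extra, illustrative example: with k = 1 the
//! algorithm degenerates to Boyer-Moore majority voting, so only a candidate
//! majority element can survive in the output.
//!
//! ```
//! use quantiles::misra_gries::misra_gries;
//!
//! let votes = vec!["a", "b", "a", "a", "c", "a"];
//! let counts = misra_gries(votes.iter(), 1);
//! assert!(counts.contains_key(&"a"));
//! ```
//!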
//! ``` //! use quantiles::misra_gries::*; //! //! let k: usize = 3; //! let numbers: Vec = vec![1,3,2,1,3,4,3,1,2,1]; //! let counts = misra_gries(numbers.iter(), k); //! let bound = numbers.len() / (k+1); //! let in_range = |f_expected: usize, f_approx: usize| { //! f_approx <= f_expected && //! (bound >= f_expected || f_approx >= (f_expected - bound)) //! }; //! assert!(in_range(4usize, *counts.get(&1).unwrap())); //! assert!(in_range(2usize, *counts.get(&2).unwrap())); //! assert!(in_range(3usize, *counts.get(&3).unwrap())); //! ``` use std::collections::BTreeMap; use std::collections::btree_map::Entry; /// Calculates the `k` most frequent elements in the iterable /// stream of elements `stream` using an ε-approximate frequency count where ε /// = 1/(k+1) pub fn misra_gries(stream: I, k: usize) -> BTreeMap where I: IntoIterator, V: Ord + Clone, { let mut counters = BTreeMap::new(); for i in stream { let counters_len = counters.len(); let mut counted = false; match counters.entry(i.clone()) { Entry::Occupied(mut item) => { *item.get_mut() += 1; counted = true; } Entry::Vacant(slot) => if counters_len < k { slot.insert(1); counted = true; }, } if !counted { for c in counters.values_mut() { *c -= 1; } counters = counters.into_iter().filter(|&(_, v)| v != 0).collect(); } } counters } #[cfg(test)] mod test { use super::*; use std::collections::BTreeMap; /// Calculate exact element frequencies using O(n) space. pub fn exact_frequencies(stream: I) -> BTreeMap where I: IntoIterator, V: Ord + Clone, { let mut counts = BTreeMap::new(); for i in stream { *counts.entry(i.clone()).or_insert(0) += 1; } counts } #[test] fn test_exact_frequencies() { let numbers = vec![1, 2, 1, 3, 3, 1, 2, 4]; let counts = exact_frequencies(numbers.iter()); assert_eq!(*counts.get(&1).unwrap() as u32, 3); assert_eq!(*counts.get(&2).unwrap() as u32, 2); assert_eq!(*counts.get(&3).unwrap() as u32, 2); assert_eq!(*counts.get(&4).unwrap() as u32, 1); } quickcheck! { fn is_exact(xs: Vec) -> bool { exact_frequencies(xs.iter()) == misra_gries(xs.iter(), xs.len()) } fn is_approximate(xs: Vec) -> bool { //(f[e] − εN) ≤ f'[e] ≤ f[e] let exacts = exact_frequencies(xs.iter()); let n = xs.len(); for k in 1..n { let epsilon_n = n / (k+1); let approxes = misra_gries(xs.iter(), k); for (i, c) in approxes { let exact = *exacts.get(i).unwrap(); if c > exact { return false; } if epsilon_n < exact && c < (exact - epsilon_n) { return false; } } } true } } } quantiles-0.7.1/tests/ckms.rs010064400007660000024000000022021307325502400144060ustar0000000000000000mod integration { mod ckms { extern crate quantiles; use self::quantiles::ckms::CKMS; use std::fs::File; use std::io::Read; use std::path::PathBuf; use std::str::FromStr; #[test] fn test_run_afl_examples() { let mut resource = PathBuf::from(env!("CARGO_MANIFEST_DIR")); resource.push("resources/afl_crashes_20161215.txt"); let mut f = File::open(resource).expect("could not open resource file"); let mut buffer = String::new(); f.read_to_string(&mut buffer) .expect("could not read resource file"); for s in buffer.lines() { let pyld: Vec = s.split_whitespace() .map(|f| f64::from_str(f)) .filter(|f| f.is_ok()) .map(|f| f.unwrap()) .collect(); if pyld.len() >= 2 { let mut ckms = CKMS::new(pyld[0]); for f in &pyld[1..] { ckms.insert(*f) } } } } } }