Statistics-TopK-0.02/000755 000765 000024 00000000000 12501730246 014545 5ustar00raygstaff000000 000000 Statistics-TopK-0.02/Changes000644 000765 000024 00000000275 12501730103 016034 0ustar00raygstaff000000 000000 Revision history for Statistics-TopK 0.02 Tue Mar 17 04:39:10 UTC 2015 - Fixed typo which broke counts(). (RT #102816) 0.01 Sat Sep 5 23:14:19 UTC 2009 - Initial distribution. Statistics-TopK-0.02/lib/000755 000765 000024 00000000000 12501730245 015312 5ustar00raygstaff000000 000000 Statistics-TopK-0.02/Makefile.PL000644 000765 000024 00000002235 12501727321 016521 0ustar00raygstaff000000 000000 use strict; use warnings; use ExtUtils::MakeMaker; my %conf = ( NAME => 'Statistics::TopK', AUTHOR => 'gray ', LICENSE => 'perl', VERSION_FROM => 'lib/Statistics/TopK.pm', ABSTRACT_FROM => 'lib/Statistics/TopK.pm', BUILD_REQUIRES => { 'Test::More' => 0.98, }, META_MERGE => { resources => { repository => 'http://github.com/gray/statistics-topk', }, }, dist => { COMPRESS => 'gzip -9f', SUFFIX => 'gz', }, clean => { FILES => 'Statistics-TopK-*' }, ); my $eumm_version = do { no warnings 'numeric'; eval $ExtUtils::MakeMaker::VERSION; }; delete $conf{META_MERGE} if $eumm_version < 6.46; $conf{PREREQ_PM} = { %{ $conf{PREREQ_PM} || {} }, %{ delete $conf{BUILD_REQUIRES} }, } if ($conf{BUILD_REQUIRES} and $eumm_version < 6.5503); WriteMakefile(%conf); sub MY::postamble { return <<" MAKE_FRAG"; authortest: \t\$(MAKE) -e \$(TEST_TYPE) TEST_FILES="xt/*.t" MAKE_FRAG } sub MY::dist_test { my $self = shift; return $self->MM::dist_test . <<" MAKE_FRAG"; \tcd \$(DISTVNAME) && \$(MAKE) authortest \$(PASTHRU) MAKE_FRAG } Statistics-TopK-0.02/MANIFEST000644 000765 000024 00000000617 12501730246 015702 0ustar00raygstaff000000 000000 Changes lib/Statistics/TopK.pm Makefile.PL MANIFEST This list of files README t/00_compile.t t/01_new.t t/02_methods.t xt/kwalitee.t xt/perlcritic.t xt/perlcriticrc xt/pod.t xt/pod_coverage.t xt/portability_filenames.t xt/vars.t META.yml Module YAML meta-data (added by MakeMaker) META.json Module JSON meta-data (added by MakeMaker) Statistics-TopK-0.02/META.json000644 000765 000024 00000001671 12501730246 016173 0ustar00raygstaff000000 000000 { "abstract" : "Implementation of the top-k streaming algorithm", "author" : [ "gray " ], "dynamic_config" : 1, "generated_by" : "ExtUtils::MakeMaker version 7.04, CPAN::Meta::Converter version 2.150001", "license" : [ "perl_5" ], "meta-spec" : { "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec", "version" : "2" }, "name" : "Statistics-TopK", "no_index" : { "directory" : [ "t", "inc" ] }, "prereqs" : { "build" : { "requires" : { "Test::More" : "0.98" } }, "configure" : { "requires" : { "ExtUtils::MakeMaker" : "0" } }, "runtime" : { "requires" : {} } }, "release_status" : "stable", "resources" : { "repository" : { "url" : "http://github.com/gray/statistics-topk" } }, "version" : "0.02" } Statistics-TopK-0.02/META.yml000644 000765 000024 00000001047 12501730246 016020 0ustar00raygstaff000000 000000 --- abstract: 'Implementation of the top-k streaming algorithm' author: - 'gray ' build_requires: Test::More: '0.98' configure_requires: ExtUtils::MakeMaker: '0' dynamic_config: 1 generated_by: 'ExtUtils::MakeMaker version 7.04, CPAN::Meta::Converter version 2.150001' license: perl meta-spec: url: http://module-build.sourceforge.net/META-spec-v1.4.html version: '1.4' name: Statistics-TopK no_index: directory: - t - inc requires: {} resources: repository: http://github.com/gray/statistics-topk version: '0.02' Statistics-TopK-0.02/README000644 000765 000024 00000000733 12501727717 015441 0ustar00raygstaff000000 000000 Statistics-TopK ============== This module implements the top-K streaming algorithm. INSTALLATION To install this module type the following: perl Makefile.PL make make test make install DEPENDENCIES This module requires these other modules and libraries: Test::More COPYRIGHT AND LICENCE Copyright (C) 2009-2015 by gray This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. Statistics-TopK-0.02/t/000755 000765 000024 00000000000 12501730245 015007 5ustar00raygstaff000000 000000 Statistics-TopK-0.02/xt/000755 000765 000024 00000000000 12501730245 015177 5ustar00raygstaff000000 000000 Statistics-TopK-0.02/xt/kwalitee.t000644 000765 000024 00000000262 12501727321 017172 0ustar00raygstaff000000 000000 use strict; use warnings; use Test::More; eval { require Test::Kwalitee; Test::Kwalitee->import(); 1 } or do { plan skip_all => 'Test::Kwalitee not installed; skipping'; }; Statistics-TopK-0.02/xt/perlcritic.t000644 000765 000024 00000000372 12501727321 017527 0ustar00raygstaff000000 000000 use strict; use warnings; use Test::More; eval { require Test::Perl::Critic; 1 } or do { plan skip_all => "Test::Perl::Critic is not installed."; }; Test::Perl::Critic->import( -profile => 'xt/perlcriticrc' ); all_critic_ok(qw( ex lib t xt )); Statistics-TopK-0.02/xt/perlcriticrc000644 000765 000024 00000000727 12501726304 017616 0ustar00raygstaff000000 000000 verbose = 8 [CodeLayout::ProhibitHardTabs] allow_leading_tabs = 0 severity = 5 [CodeLayout::ProhibitTrailingWhitespace] severity = 5 # requires Perl::Critic::More [CodeLayout::RequireASCII] severity = 5 [TestingAndDebugging::RequireUseWarnings] severity = 5 [-TestingAndDebugging::ProhibitNoStrict] [-BuiltinFunctions::ProhibitStringyEval] [-Modules::RequireFilenameMatchesPackage] # requires Regexp::Parser [RegularExpressions::ProhibitUnusedCapture] severity = 5 Statistics-TopK-0.02/xt/pod.t000644 000765 000024 00000000250 12501727321 016144 0ustar00raygstaff000000 000000 use strict; use warnings; use Test::More; eval "use Test::Pod 1.00; 1" or do { plan skip_all => 'Test::Pod 1.00 required for testing POD'; }; all_pod_files_ok(); Statistics-TopK-0.02/xt/pod_coverage.t000644 000765 000024 00000000270 12501727321 020021 0ustar00raygstaff000000 000000 use strict; use warnings; use Test::More; eval "use Test::Pod::Coverage 1.00; 1" or do { plan skip_all => 'Test::Pod::Coverage 1.00 is not installed.'; }; all_pod_coverage_ok(); Statistics-TopK-0.02/xt/portability_filenames.t000644 000765 000024 00000000256 12501726304 021755 0ustar00raygstaff000000 000000 use strict; use warnings; use Test::More; eval "use Test::Portability::Files; 1" or do { plan skip_all => 'Test::Portability::Files is not installed.'; }; run_tests(); Statistics-TopK-0.02/xt/vars.t000644 000765 000024 00000000224 12501726304 016336 0ustar00raygstaff000000 000000 use strict; use warnings; use Test::More; eval "use Test::Vars; 1" or do { plan skip_all => 'Test::Vars is not installed.'; }; all_vars_ok(); Statistics-TopK-0.02/t/00_compile.t000644 000765 000024 00000000136 12501727314 017126 0ustar00raygstaff000000 000000 use strict; use warnings; use Test::More; BEGIN { use_ok 'Statistics::TopK' } done_testing; Statistics-TopK-0.02/t/01_new.t000644 000765 000024 00000000330 12501727314 016264 0ustar00raygstaff000000 000000 use strict; use warnings; use Test::More; use Statistics::TopK; my $counter = Statistics::TopK->new(10); isa_ok($counter, 'Statistics::TopK', 'new'); can_ok('Statistics::TopK', qw( add top counts )); done_testing; Statistics-TopK-0.02/t/02_methods.t000644 000765 000024 00000006342 12501727314 017150 0ustar00raygstaff000000 000000 use strict; use warnings; use Test::More; use Statistics::TopK; { my $counter = Statistics::TopK->new(10); is_deeply([$counter->top], [], 'stream containing no elements'); } { my $counter = Statistics::TopK->new(10); for my $elem ( ('a') x 100 ) { $counter->add($elem); } is_deeply( [$counter->top], ['a'], 'stream containing one distinct element' ); is_deeply( [ $counter->counts ], [ a => 100 ], 'counts for one stream with one distinct element' ); } { my $counter = Statistics::TopK->new(10); for my $elem (1 .. 100) { $counter->add($elem); } is_deeply( [ sort { $a <=> $b } $counter->top ], [91 .. 100], 'stream containing all distinct elements' ); } { my $counter = Statistics::TopK->new(4); my @stream = qw( 1 2 1 1 1 1 1 1 9 1 1 3 1 1 3 4 1 5 1 4 1 1 1 2 1 1 1 1 1 1 1 1 1 2 1 1 2 9 1 1 1 1 2 1 4 1 6 1 3 1 4 1 1 1 7 2 5 1 1 1 4 1 1 1 4 1 1 3 9 8 1 7 1 2 1 1 1 1 1 1 8 1 1 1 1 1 2 1 1 1 1 2 4 9 5 1 1 1 1 2 1 3 1 1 1 1 5 2 3 2 1 1 5 3 2 1 7 1 6 1 1 1 1 2 1 2 1 3 1 1 1 2 4 1 3 3 6 6 1 1 3 4 1 1 1 2 1 3 1 1 6 1 1 1 1 2 8 3 1 2 1 1 1 1 1 6 1 6 1 1 6 7 2 2 2 1 1 1 1 4 1 5 1 4 1 2 1 1 3 1 1 1 1 2 5 1 1 1 2 2 6 1 1 1 1 9 1 1 1 2 1 1 2 2 2 5 2 1 1 1 2 2 3 6 2 5 4 1 1 8 1 1 10 2 1 1 1 1 1 2 2 1 6 1 2 1 1 1 7 6 6 3 3 1 1 5 1 1 1 1 1 2 4 1 1 1 5 2 2 1 1 8 1 2 9 3 1 1 1 1 2 3 1 1 3 1 1 1 5 1 1 1 1 1 5 1 4 2 1 1 2 3 1 3 1 1 1 1 1 3 10 1 3 1 2 1 2 1 1 3 3 1 1 1 1 1 9 1 1 2 2 1 1 5 1 1 3 3 2 1 1 1 5 1 1 4 1 3 1 2 1 4 4 1 1 1 5 1 1 2 2 1 5 1 1 1 1 1 5 6 1 1 2 1 1 1 3 4 1 1 1 8 1 3 1 9 1 1 2 1 1 1 1 7 1 1 1 1 1 1 5 2 1 4 1 6 1 2 1 1 1 3 1 4 2 1 1 1 2 5 1 1 1 2 1 1 1 3 1 1 1 1 2 1 4 2 3 2 1 8 4 1 2 1 1 1 1 1 1 1 1 1 1 3 1 1 2 1 1 1 1 1 2 2 1 1 1 3 1 2 2 4 2 2 1 1 5 1 4 1 1 2 1 1 1 3 2 1 1 7 1 3 1 2 3 4 1 1 2 1 1 3 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2 1 2 1 2 1 1 3 1 1 2 1 1 1 1 1 1 6 1 1 5 1 2 1 2 1 1 1 2 1 9 1 1 4 1 1 1 6 1 1 1 2 2 1 1 1 2 1 9 1 3 1 1 2 1 2 1 1 5 3 1 4 1 1 1 1 1 1 1 1 2 4 1 1 1 2 1 2 1 1 1 1 1 4 4 2 4 1 1 7 1 2 1 3 8 1 1 1 1 1 1 7 1 4 1 3 1 1 4 3 2 7 2 1 1 10 1 1 2 4 1 1 2 1 1 1 1 1 1 7 1 2 8 7 2 1 2 7 3 1 1 2 1 3 1 3 1 1 1 1 3 1 1 1 1 1 3 3 1 9 2 1 2 1 3 1 1 1 3 3 1 2 1 4 1 2 5 1 4 5 1 2 3 2 3 1 1 1 1 1 1 1 1 2 1 4 4 3 1 1 4 1 3 1 6 3 1 1 1 1 2 1 2 6 1 3 8 1 1 1 2 1 1 1 1 10 3 3 1 1 1 2 1 1 2 1 1 2 1 1 1 2 1 3 3 2 1 2 3 1 1 1 1 3 1 2 1 1 4 4 5 1 1 3 2 1 1 7 6 2 1 1 2 7 1 5 1 1 1 1 1 1 1 4 1 1 1 1 1 8 4 1 1 1 4 1 1 1 2 1 1 1 6 1 10 7 1 2 1 1 1 2 3 3 1 3 1 4 2 1 2 2 3 1 1 1 2 1 1 2 9 1 9 1 1 1 2 2 2 4 4 1 1 10 1 1 2 6 1 2 1 1 1 4 1 1 10 1 1 1 2 1 1 2 2 1 1 2 10 2 7 2 3 1 1 1 5 1 1 1 3 7 2 1 1 1 4 4 4 1 1 5 1 1 1 9 4 1 1 1 9 3 1 1 1 4 3 2 2 7 1 1 1 3 1 3 8 1 1 3 5 4 1 1 1 1 1 1 1 2 1 1 5 2 1 4 1 1 1 2 2 1 4 1 1 1 1 1 3 4 10 2 1 1 1 1 2 1 1 1 1 2 1 2 2 1 1 1 1 2 1 2 9 1 1 1 1 1 3 2 10 1 1 1 ); $counter->add($_) for @stream; is_deeply( [ sort { $a <=> $b } $counter->top ], [1 .. 4], 'stream with non-uniform distribution' ); } done_testing; Statistics-TopK-0.02/lib/Statistics/000755 000765 000024 00000000000 12501730245 017444 5ustar00raygstaff000000 000000 Statistics-TopK-0.02/lib/Statistics/TopK.pm000644 000765 000024 00000011254 12501727751 020672 0ustar00raygstaff000000 000000 package Statistics::TopK; use strict; use warnings; use Carp qw(croak); our $VERSION = '0.02'; $VERSION = eval $VERSION; use constant _K => 0; use constant _COUNTS => 1; use constant _ELEMS => 2; use constant _SIZE => 3; use constant _INDEX => 4; sub new { my ($class, $k) = @_; croak 'expecting a positive integer' unless defined $k and $k =~ /^\d+$/ and $k > 0; my $self = [ $k, # _K {}, # _COUNTS [], # _ELEMS 0, # _SIZE 0, # _INDEX ]; # Pre-extend the internal data structures, just in case $k is large. keys %{ $self->[_COUNTS] } = $k; $#{ $self->[_ELEMS] } = $k - 1; return bless $self, $class; } sub add { my ($self, $elem) = @_; # Increment the element's counter if it is currently being counted. if (exists $self->[_COUNTS]{$elem}) { return $self->[_COUNTS]{$elem} += 1; } # Add the element if it's not being counted and there are free slots. if ($self->[_SIZE] < $self->[_K]) { $self->[_ELEMS][ $self->[_SIZE]++ ] = $elem; return $self->[_COUNTS]{$elem} = 1; } # Decrement one of the currently counted elements. my $index = $self->[_INDEX]; my $prev = $self->[_ELEMS][$index]; my $count = $self->[_COUNTS]{$prev} -= 1; # Advance the counter. $self->[_INDEX] = ++$self->[_INDEX] % $self->[_K]; # If the count of the decremented element reaches 0, replace it with the # current element. if (0 == $count) { delete $self->[_COUNTS]{$prev}; $self->[_ELEMS][$index] = $elem; return $self->[_COUNTS]{$elem} = 1; } # This element is not currently being counted. return 0; } sub top { return keys %{$_[0]->[_COUNTS]}; } sub counts { return %{$_[0]->[_COUNTS]}; } 1; __END__ =head1 NAME Statistics::TopK - Implementation of the top-k streaming algorithm =head1 SYNOPSIS use Statistics::TopK; my $counter = Statistics::TopK->new(10); while (my $val = ) { chomp $val; $counter->add($val); } my @top = $counter->top; my %counts = $counter->counts; =head1 DESCRIPTION The C module implements the top-k streaming algorithm, also know as the "heavy hitters" algorithm. It is designed to process data streams and probabilistally calculate the C most frequent items while using limited memory. A typical example would be to determine the top 10 IP addresses listed in an access log. A simple solution would be to hash each IP address to a counter and then sort the resulting hash by the counter size. But the hash could theoretically require over 4 billion keys. The top-k algorithm only requires storage space proportional to the number of items of interest. It accomplishes this by sacrificing precision, as it is only a probabilistic counter. =head1 METHODS =head2 new $counter = Statistics::TopK->new($k) Creates a new C object which is prepared to count the top C<$k> elements. =head2 add $count = $counter->add($element) Count the given C<$element> and return its approximate count (if any) in the C object. Note that adding an element does not guarantee it will be counted yet, as the algorithm is probabilistic, and the occurrence of the current element might only be used decrease the count of one of the current top elements. =head2 top @top = $counter->top() Returns a list of the top-k counted elements. =head2 counts %counts = $counter->counts() Returns a hash of the top-k counted elements and their counts. =head1 SEE ALSO http://en.wikipedia.org/wiki/Streaming_algorithm#Heavy_hitters =head1 REQUESTS AND BUGS Please report any bugs or feature requests to L. I will be notified, and then you'll automatically be notified of progress on your bug as I make changes. =head1 SUPPORT You can find documentation for this module with the perldoc command. perldoc Statistics::TopK You can also look for information at: =over =item * GitHub Source Repository L =item * AnnoCPAN: Annotated CPAN documentation L =item * CPAN Ratings L =item * RT: CPAN's request tracker L =item * Search CPAN L =back =head1 COPYRIGHT AND LICENSE Copyright (C) 2009-2015 gray , all rights reserved. This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =head1 AUTHOR gray, =cut