#!/usr/bin/perl

use 5.006;
use strict;
use warnings;
no warnings 'syntax';

use ExtUtils::MakeMaker;

#
# Distribution identity.  The path of the main module and the name of the
# repository are both derived from the package name.
#
my $PACKAGE    = 'Algorithm::Numerical::Sample';
my $LIB_FILE   = join '/', 'lib', split /::/, "$PACKAGE.pm";

my $LOCAL_ADDR = 'cpan';
my $DOMAIN     = 'abigail.be';

my $REPO       = join '--', split /::/, $PACKAGE;
my $REPO_HOST  = 'github.com';
my $ME         = 'Abigail';
my $REPOSITORY = "git://$REPO_HOST/$ME/" . lc "$REPO.git";

#
# Everything we would like to hand to WriteMakefile; arguments that the
# installed ExtUtils::MakeMaker is too old for are removed further down.
#
my %args = (
    NAME             => $PACKAGE,
    VERSION_FROM     => $LIB_FILE,
    ABSTRACT_FROM    => $LIB_FILE,
    PREREQ_PM        => {
        'strict'   => 0,
        'warnings' => 0,
        'Exporter' => 0,
    },
    MIN_PERL_VERSION => 5.006,
    AUTHOR           => "$ME <$LOCAL_ADDR\@$DOMAIN>",
    LICENSE          => 'mit',
    META_MERGE       => {
        test_requires => {
            'strict'     => 0,
            'warnings'   => 0,
            'Test::More' => 0.88,
        },
        resources     => {
            repository => $REPOSITORY,
        },
        keywords      => [qw [ ]],
    },
);

#
# Unless given explicitly, the build requirements are the test requirements
# plus MakeMaker itself, and the configure requirements are that very same
# hash (the reference is shared, not copied).
#
my $meta = $args {META_MERGE};
$meta -> {build_requires}     ||= {
    'ExtUtils::MakeMaker' => 0,
    %{$meta -> {test_requires}},
};
$meta -> {configure_requires} ||= $meta -> {build_requires};

#
# Maps an argument name to the first MakeMaker version that understands it.
# The versions are compared as strings.
#
my %filter = (
    MIN_PERL_VERSION => '6.48',
    LICENSE          => '6.48',
    META_MERGE       => '6.46',
    AUTHOR           => '6.07',
    ABSTRACT_FROM    => '6.07',
);

foreach my $name (keys %filter) {
    next unless defined $filter {$name};
    delete $args {$name}
        if $ExtUtils::MakeMaker::VERSION lt $filter {$name};
}

WriteMakefile (%args);

__END__
+ POD and Kwalitee tests. + Even moderner Makefile.PL. + Test::NoWarnings. Version 2009102701 + Modernized Makefile.PL. Version 2009040301 + Moved to git and github. Revision 1.3 1999/08/09 08:01:05 abigail + Changed *all* occurences of Algorithms to Algorithm. Revision 1.2 1999/03/01 21:06:07 abigail + Changed package to Algorithm::* Revision 1.1 1998/04/29 03:05:57 abigail + Initial revision Algorithm-Numerical-Sample-2010011201/t/0000755000076400007640000000000011323106101017166 5ustar abigailabigailAlgorithm-Numerical-Sample-2010011201/t/100_historic.t0000755000076400007640000000124106666600246021607 0ustar abigailabigail# Before `make install' is performed this script should be runnable with # `make test'. After `make install' it should work as `perl test.pl' ######################### We start with some black magic to print on failure. # Change 1..1 below to 1..last_test_to_print . # (It may become useful if the test is moved to ./t subdirectory.) BEGIN { $| = 1; print "1..1\n"; } END {print "not ok 1\n" unless $loaded;} use Algorithm::Numerical::Sample; $loaded = 1; print "ok 1\n"; ######################### End of black magic. 
#!/usr/bin/perl

use 5.006;
use strict;
use warnings;
no warnings 'syntax';

use Test::More 0.88;

# Test::NoWarnings is optional: $r is true only if it could be loaded.
# It is loaded with require, so its import method is not called.
our $r = eval "require Test::NoWarnings; 1";

# Load the module at compile time; nothing else is worth running if that
# fails.
BEGIN {
    unless (use_ok ('Algorithm::Numerical::Sample')) {
        BAIL_OUT ("Loading of 'Algorithm::Numerical::Sample' failed");
    }
}

ok defined $Algorithm::Numerical::Sample::VERSION, "VERSION is set";

# The warnings check is invoked by hand, before the plan is finalised.
if ($r) {
    Test::NoWarnings::had_no_warnings ();
}

done_testing;
package Algorithm::Numerical::Sample;

use 5.006;
use strict;
use warnings;
no warnings 'syntax';

use Exporter ();

our @ISA       = qw /Exporter/;
our @EXPORT    = qw //;
our @EXPORT_OK = qw /sample/;

our $VERSION   = '2010011201';

# Named parameters understood by sample (); each one may also be spelled
# with a leading dash.
my @PARAMS = qw /set sample_size/;

#
# sample (set => ARRAYREF [, sample_size => EXPR])
#
# Draws a random sample from the given set using selection sampling: the
# elements are visited in order, and each one is selected with probability
# (still needed) / (still available).  The selected elements keep the
# relative order they have in the set.
#
# Parameters (both may be given with a leading "-"):
#     set          Reference to the array to sample from.  Required.
#     sample_size  Number of elements to draw; defaults to 1.  A request
#                  for more elements than the set holds yields the whole
#                  set, as there is nothing else to draw.
#
# Returns the sample as a list in list context, and as a reference to an
# array in scalar context.
#
# Dies with "sample requires the set parameter" if no (true) set is given.
#
sub sample {
    my %args = @_;

    # Fall back on the dashed spelling if the plain one is not defined.
    foreach my $param (@PARAMS) {
        $args{$param} = $args{"-$param"} unless defined $args{$param};
    }

    die "sample requires the set parameter" unless $args{set};

    my $set         = $args{set};
    my $set_size    = @$set;
    my $sample_size = defined $args{sample_size} ? $args{sample_size} : 1;

    # We cannot draw more elements than there are.  Without this cap the
    # loop below would walk off the end of the set and pad the sample
    # with undefined values.
    $sample_size = $set_size if $sample_size > $set_size;

    my @reservoir;
    my $set_counter = 0;

    # Loop as long as the sample is not complete.
    while (@reservoir < $sample_size) {
        # Select the next element with probability
        #
        #     $sample_size - @reservoir
        #     -------------------------
        #     $set_size - $set_counter
        #
        my $U = rand($set_size - $set_counter);
        if ($U < $sample_size - @reservoir) {
            push @reservoir, $set->[$set_counter];
        }
        $set_counter++;
    }

    return wantarray ? @reservoir : \@reservoir;
}


package Algorithm::Numerical::Sample::Stream;

use strict;

#
# new ([sample_size => EXPR])
#
# Creates a sampler for a set whose size is not known in advance.  The
# optional sample_size parameter (a leading "-" is allowed) is the number
# of elements to sample; it defaults to 1.  May be called on the class or
# on an existing object.
#
sub new {
    my $proto = shift;
    my $class = ref $proto || $proto;

    my %args  = @_;

    foreach my $param (qw /sample_size/) {
        $args{$param} = $args{"-$param"} unless defined $args{$param};
    }

    my $self = {};

    $self->{sample_size} = defined $args{sample_size} ? $args{sample_size} : 1;
    $self->{seen}        = 0;
    $self->{reservoir}   = [(undef) x $self->{sample_size}];

    return bless $self, $class;
}

#
# data (LIST)
#
# Offers each element of LIST to the sampler.  Reservoir slots hold
# [position, element] pairs, so extract () can return the sample in order
# of arrival.  Returns nothing.
#
sub data {
    my $self = shift;

    foreach my $sample (@_) {
        if ($self->{seen} < $self->{sample_size}) {
            # The first sample_size elements fill the reservoir.
            $self->{reservoir}->[$self->{seen}] = [$self->{seen}, $sample];
        }
        else {
            # Draw a slot number out of all elements seen so far, this one
            # included; the element only replaces something if that number
            # falls inside the reservoir.
            my $U = int rand($self->{seen} + 1);
            if ($U < $self->{sample_size}) {
                $self->{reservoir}->[$U] = [$self->{seen}, $sample];
            }
        }
        $self->{seen}++;
    }

    return;
}

#
# extract ()
#
# Returns the current sample, ordered by arrival, and resets the object so
# a fresh sample of the same size can be taken.  In list context the whole
# sample is returned; in scalar context just its first element.  If fewer
# than sample_size elements have been seen, only those elements are
# returned.
#
sub extract {
    my $self = shift;

    # Slots that were never filled are still undefined.  They have to be
    # skipped: sorting them would dereference undef, causing warnings and
    # undefined values in the result.
    my @result = map  { $_->[1] }
                 sort { $a->[0] <=> $b->[0] }
                 grep { defined $_ } @{ $self->{reservoir} };

    $self->{seen}      = 0;
    $self->{reservoir} = [(undef) x $self->{sample_size}];

    return wantarray ? @result : $result[0];
}

# Explicit true value, so loading the file does not depend on what the
# last statement executed happens to return.
1;
=head2 B<B>: C<Algorithm::Numerical::Sample::Stream>

The class C<Algorithm::Numerical::Sample::Stream> has the following
methods:

=over

=item C<new>

This function returns an object of the
C<Algorithm::Numerical::Sample::Stream> class. It will take an optional
argument of the form C<sample_size =E<gt> EXPR>, where C<EXPR> evaluates
to the sample size to be taken. If this argument is missing, a sample of
size C<1> will be taken. The keyword C<sample_size> may be preceded by an
optional dash.

=item C<data (LIST)>

The method C<data> takes a list of parameters which are elements of the
set we are sampling. Any number of arguments can be given.

=item C<extract>

This method will extract the sample from the object, and reset it to a
fresh state, such that a sample of the same size but from a different
set, can be taken. C<extract> will return a list in list context, or the
first element of the sample in scalar context.

=back

=head1 CORRECTNESS PROOFS

=head2 Algorithm A.

Crucial to see that the C<sample> algorithm is correct is the fact that
when we sample C<n> elements from a set of size C<N> that the C<t + 1>st
element is chosen with probability C<(n - m)/(N - t)>, when already C<m>
elements have been chosen. We can immediately see that we will never pick
too many elements (as the probability is 0 as soon as C<n == m>), nor too
few, as the probability will be 1 if we have C<k> elements to choose from
the remaining C<k> elements, for some C<k>. For the proof that the
sampling is unbiased, we refer to [3]. (Section 3.4.2, Exercise 3).

=head2 Algorithm B.

It is easy to see that the second algorithm returns the correct number of
elements. For a sample of size C<n>, the first C<n> elements go into the
reservoir, and after that, the reservoir never grows or shrinks in size;
elements only get replaced. A detailed proof of the fairness of the
algorithm appears in [3]. (Section 3.4.2, Exercise 7).

=head1 LITERATURE

Both algorithms are discussed by Knuth [3] (Section 3.4.2). The first
algorithm, I<Selection sampling technique>, was discovered by Fan, Muller
and Rezucha [1], and independently by Jones [2]. The second algorithm,
I<Reservoir sampling>, is due to Waterman.

=head1 REFERENCES

=over

=item [1]

C. T. Fan, M. E. Muller and I.
Rezucha, I<J. Amer. Stat. Assoc.> B<57> (1962), pp 387 - 402.

=item [2]

T. G. Jones, I<CACM> B<5> (1962), pp 343.

=item [3]

D. E. Knuth: I<The Art of Computer Programming>, Volume 2, Third edition.
Reading: Addison-Wesley, 1997. ISBN: 0-201-89684-2.

=back

=head1 DEVELOPMENT

The current sources of this module are found on github,
L<< git://github.com/Abigail/algorithm--numerical--sample.git >>.

=head1 AUTHOR

This package was written by Abigail, L<< cpan@abigail.be >>.

=head1 COPYRIGHT and LICENSE

Copyright (C) 1998, 1999, 2009, Abigail.

Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

=cut
Algorithm-Numerical-Sample-2010011201/README0000644000076400007640000001155611165422217017630 0ustar abigailabigail
NAME
    Algorithm::Numerical::Sample - Draw samples from a set

SYNOPSIS
        use Algorithm::Numerical::Sample qw /sample/;

        @sample = sample (-set => [1 ..
10000], -sample_size => 100);

        $sampler = Algorithm::Numerical::Sample::Stream -> new;
        while (<>) {$sampler -> data ($_)}
        $random_line = $sampler -> extract;

DESCRIPTION
    This package gives two methods to draw fair, random samples from a set.
    There is a procedural interface for the case the entire set is known,
    and an object oriented interface when a set with unknown size has to
    be processed.

  A: "sample (set => ARRAYREF [,sample_size => EXPR])"
    The "sample" function takes a set and a sample size as arguments. If the
    sample size is omitted, a sample of 1 is taken. The keywords "set" and
    "sample_size" may be preceded with an optional "-". The function
    returns the sample list, or a reference to the sample list, depending on
    the context.

  B: "Algorithm::Numerical::Sample::Stream"
    The class "Algorithm::Numerical::Sample::Stream" has the following
    methods:

    "new"
        This function returns an object of the
        "Algorithm::Numerical::Sample::Stream" class. It will take an
        optional argument of the form "sample_size => EXPR", where "EXPR"
        evaluates to the sample size to be taken. If this argument is
        missing, a sample of size 1 will be taken. The keyword "sample_size"
        may be preceded by an optional dash.

    "data (LIST)"
        The method "data" takes a list of parameters which are elements of
        the set we are sampling. Any number of arguments can be given.

    "extract"
        This method will extract the sample from the object, and reset it to
        a fresh state, such that a sample of the same size but from a
        different set, can be taken. "extract" will return a list in list
        context, or the first element of the sample in scalar context.

CORRECTNESS PROOFS
  Algorithm A.
    Crucial to see that the "sample" algorithm is correct is the fact that
    when we sample "n" elements from a set of size "N" that the "t + 1"st
    element is chosen with probability "(n - m)/(N - t)", when already "m"
    elements have been chosen.
We can immediately see that we will never pick too many elements (as
    the probability is 0 as soon as "n == m"), nor too few, as the
    probability will be 1 if we have "k" elements to choose from the
    remaining "k" elements, for some "k". For the proof that the sampling is
    unbiased, we refer to [3]. (Section 3.4.2, Exercise 3).

  Algorithm B.
    It is easy to see that the second algorithm returns the correct number
    of elements. For a sample of size "n", the first "n" elements go into
    the reservoir, and after that, the reservoir never grows or shrinks in
    size; elements only get replaced. A detailed proof of the fairness of
    the algorithm appears in [3]. (Section 3.4.2, Exercise 7).

LITERATURE
    Both algorithms are discussed by Knuth [3] (Section 3.4.2). The first
    algorithm, *Selection sampling technique*, was discovered by Fan, Muller
    and Rezucha [1], and independently by Jones [2]. The second algorithm,
    *Reservoir sampling*, is due to Waterman.

REFERENCES
    [1] C. T. Fan, M. E. Muller and I. Rezucha, *J. Amer. Stat. Assoc.* 57
        (1962), pp 387 - 402.

    [2] T. G. Jones, *CACM* 5 (1962), pp 343.

    [3] D. E. Knuth: *The Art of Computer Programming*, Volume 2, Third
        edition. Reading: Addison-Wesley, 1997. ISBN: 0-201-89684-2.

DEVELOPMENT
    The current sources of this module are found on github,
    <git://github.com/Abigail/algorithm--numerical--sample.git>.

AUTHOR
    This package was written by Abigail, cpan@abigail.be.

COPYRIGHT and LICENSE
    Copyright (C) 1998, 1999, 2009, Abigail.

    Permission is hereby granted, free of charge, to any person obtaining a
    copy of this software and associated documentation files (the
    "Software"), to deal in the Software without restriction, including
    without limitation the rights to use, copy, modify, merge, publish,
    distribute, sublicense, and/or sell copies of the Software, and to
    permit persons to whom the Software is furnished to do so, subject to
    the following conditions:

    The above copyright notice and this permission notice shall be included
    in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.