Statistics-Test-Sequence-0.01/0000755000175000017500000000000010547464515014744 5ustar tseetseeStatistics-Test-Sequence-0.01/t/0000755000175000017500000000000010547464515015207 5ustar tseetseeStatistics-Test-Sequence-0.01/t/20tests.t0000644000175000017500000000126510547462626016705 0ustar tseetseeuse strict; use warnings; use Test::More tests => 14; BEGIN { use_ok('Statistics::Test::Sequence') }; my $t = Statistics::Test::Sequence->new(); isa_ok($t, 'Statistics::Test::Sequence'); eval { $t->set_data( [map rand(), 1..10000] ); }; ok(!$@); ok(ref($t->{data}) eq 'ARRAY'); ok(@{$t->{data}} == 10000); my ($res, $bins, $exp); eval { ($res, $bins, $exp) = $t->test(); }; ok(!$@); ok(defined $res); ok(ref($bins) eq 'ARRAY'); ok(ref($exp) eq 'ARRAY'); eval { $t->set_data( sub { map rand(), 1..100 }, 100 ); }; ok(!$@); eval { ($res, $bins, $exp) = $t->test(); }; ok(!$@); ok(defined $res); ok(ref($bins) eq 'ARRAY'); ok(ref($exp) eq 'ARRAY'); Statistics-Test-Sequence-0.01/t/00podcover.t0000644000175000017500000000024210547430152017341 0ustar tseetseeuse Test::More; eval "use Test::Pod::Coverage 1.00"; plan skip_all => "Test::Pod::Coverage 1.00 required for testing POD coverage" if $@; all_pod_coverage_ok(); Statistics-Test-Sequence-0.01/t/12expfreq.t0000644000175000017500000000123010547463751017206 0ustar tseetseeuse strict; use warnings; use Test::More tests => 1+20; BEGIN { use_ok('Statistics::Test::Sequence') }; use constant EPS => 1e-9; foreach (1..10) { my $n = 10000*$_**2; my $f = Statistics::Test::Sequence::expected_frequency(1, $n); my $exp = ($n*5+1)/12; ok(about_equal($exp, $f), "Expect frequency for n=$n, k=1 ($exp == $f)"); } foreach (1..10) { my $n = 10005*$_**2; my $f = Statistics::Test::Sequence::expected_frequency(2, $n); my $exp = ($n*11-14)/60; ok(about_equal($exp, $f), "Expect frequency for n=$n, k=2 ($exp == $f)"); } sub about_equal { return 1 if $_[0] + EPS > $_[1] and $_[0] - EPS < $_[1]; return 0; } 
Statistics-Test-Sequence-0.01/t/00pod.t0000644000175000017500000000020210547430152016276 0ustar tseetseeuse Test::More; eval "use Test::Pod 1.00"; plan skip_all => "Test::Pod 1.00 required for testing POD" if $@; all_pod_files_ok(); Statistics-Test-Sequence-0.01/t/11faculty.t0000644000175000017500000000102710547463203017176 0ustar tseetseeuse strict; use warnings; use Test::More tests => 1+2*20; BEGIN { use_ok('Statistics::Test::Sequence') }; my @fac = qw( 1 2 6 24 120 720 5040 40320 362880 3628800 39916800 479001600 6227020800 87178291200 1307674368000 20922789888000 355687428096000 6402373705728000 121645100408832000 2432902008176640000 ); foreach (1..2) { my $msg = ''; $msg = ' ... memoized' if $_ ==2; foreach (1..@fac) { is( $fac[$_-1], Statistics::Test::Sequence::faculty($_), "faculty($_)$msg" ); } } Statistics-Test-Sequence-0.01/Changes0000644000175000017500000000016110547464323016232 0ustar tseetseeRevision history for Perl extension Statistics::Test::Sequence 0.01 Fri Jan 5 15:03 2007 - original version Statistics-Test-Sequence-0.01/examples/0000755000175000017500000000000010547464515016562 5ustar tseetseeStatistics-Test-Sequence-0.01/examples/comparison.pl0000644000175000017500000000154010547461057021267 0ustar tseetsee#!/usr/bin/perl use strict; use warnings; use lib 'lib'; use Statistics::Test::Sequence; use Data::Dumper; use Math::Random::MT; my $t = Statistics::Test::Sequence->new(); my $rnd; open my $fh, '<', '/dev/random' or die $!; read($fh, $rnd, 32); $rnd = unpack('%L', $rnd); my $gen = Math::Random::MT->new($rnd); { my $x = 4711; my $a = 421; my $c = 64773; my $m = 259200; sub lin_kong { $x = ($a*$x + $c) % $m; return $x; } } my $num = 10000000; foreach ( [ 'rand', sub {map rand(), 1..10000}, $num/10000 ], [ 'MT', sub {map $gen->rand(), 1..10000}, $num/10000 ], [ 'lin', \&lin_kong, $num ], ) { my $name = shift @$_; $t->set_data(@$_); print "Testing $name...\n"; my ($resid, $bins, $expected) = $t->test(); print Dumper $resid; print Dumper $bins; 
print Dumper $expected; } Statistics-Test-Sequence-0.01/MANIFEST0000644000175000017500000000040010547464515016067 0ustar tseetseeChanges examples/comparison.pl lib/Statistics/Test/Sequence.pm Makefile.PL MANIFEST This list of files README t/00pod.t t/00podcover.t t/11faculty.t t/12expfreq.t t/20tests.t META.yml Module meta-data (added by MakeMaker) Statistics-Test-Sequence-0.01/lib/0000755000175000017500000000000010547464515015512 5ustar tseetseeStatistics-Test-Sequence-0.01/lib/Statistics/0000755000175000017500000000000010547464515017644 5ustar tseetseeStatistics-Test-Sequence-0.01/lib/Statistics/Test/0000755000175000017500000000000010547464515020563 5ustar tseetseeStatistics-Test-Sequence-0.01/lib/Statistics/Test/Sequence.pm0000644000175000017500000002144010547464434022672 0ustar tseetseepackage Statistics::Test::Sequence; use 5.006; use strict; use warnings; our $VERSION = '0.01'; use Carp qw/croak/; use Params::Util qw/_POSINT _ARRAY _CODE/; use Math::BigFloat; use Memoize; =head1 NAME Statistics::Test::Sequence - Sequence correlation test for random numbers =head1 SYNOPSIS use Statistics::Test::Sequence; my $tester = Statistics::Test::Sequence->new(); $tester->set_data( [map {rand()} 1..1000000] ); my ($metric, $actual_freq, $expected_freq) = $tester->test(); use Data::Dumper; print "$metric\n"; print "Frequencies:\n"; print Dumper $actual_freq; print "Expected frequencies:\n"; print Dumper $expected_freq; =head1 DESCRIPTION This module implements a sequence correlation test for random number generators. It shows pairwise correlation between subsequent random numbers. The algorithm is as follows: (Following Blobel. Citation in SEE ALSO section.) =over 2 =item * Given C random numbers C. =item * For all C, compare C with C. If C is greater then C, assign a 0-Bit to the number. Otherwise, assign a 1-Bit. =item * Find all sequences of equal Bits. For every sequence, increment a counter for the length C of that sequence. 
(Regardless of whether it's a sequence of 1's or 0's.)

=item *

For uncorrelated random numbers, the number of sequences C<N(k)> of
length C<k> in the set of C<N+1> random numbers is expected to be:

  N(k) = 2*((k^2+3*k+1)*N - (k^3+3*k^2-k-4)) / (k+3)!

=back

=head1 METHODS

=cut

=head2 new

Creates a new random number tester.

=cut

sub new {
    my $proto = shift;
    # Allow both Class->new() and $object->new().
    my $class = ref($proto) || $proto;

    my $self = {
        data => undef,    # filled in by set_data
    };

    bless $self => $class;

    return $self;
}

=head2 set_data

Sets the random numbers to operate on. First argument
must be either an array reference to an array of random
numbers or a code reference.

If the first argument is a code reference, the second
argument must be an integer C<n>. The code reference is
called C<n>-times and its return values are used as
random numbers.

The code reference semantics are particularly useful if
you do not want to store all random numbers in memory at
the same time. You can write a subroutine that, for example,
generates and returns batches of 100 random numbers so no
more than 101 of these numbers will be in memory at the
same time. Note that if you return 100 numbers at once and
pass in C<n=50>, you will have a sequence of 5000 random
numbers.

Returns true on success and throws an exception on invalid
arguments. If the arguments are rejected, previously set data
is left untouched.

=cut

sub set_data {
    my $self = shift;
    my $data = shift;

    if (_ARRAY($data)) {
        $self->{data} = $data;
        return 1;
    }
    elsif (_CODE($data)) {
        my $n = shift;

        # Validate the call count *before* touching the object so that a
        # rejected call cannot leave a generator stored without a usable
        # number of calls.
        if (not _POSINT($n)) {
            croak("'set_data' needs an integer as second argument if the first argument is a code reference.");
        }

        $self->{data} = $data;
        $self->{n}    = $n;
        return 1;
    }
    else {
        croak("Invalid arguments to 'set_data'.");
    }
}

=head2 test

Runs the sequence test on the data that was previously set using
C<set_data>.

Returns three items: The first is the root mean square of the bin
residuals divided by the number of random numbers. It I<could> be
used as a measure for the quality of the random number generator
and should be as close to zero as possible. A better metric is to
compare the following two return values.

The second return value is a reference to the array of frequencies.
An example is in order here. Generating one million random numbers,
I get:

  [0, 416765, 181078, 56318, 11486, 1056, 150]

This means there were no sequences of length 0 (obvious), 416765
sequences of length 1, etc. There were no sequences of length 7 or
greater. This example is a bad random number generator! (It's a
linear congruential generator with C<(a*x_i+c)%m> and C<a=421>,
C<c=64773>, C<m=259200>, and C<x_0=4711>).

The third return value is similar in nature to the second in that it
is a reference to an array containing sequence length frequencies.
This one, however, contains the frequencies that would be expected for
the given number of random numbers, were they uncorrelated.
The number of bins has the maximum
length of an occurring sequence as an upper limit. In the given example,
you would get: (Dumped with Data::Dumper)

  $VAR1 = [
            '0',
            '416666.75',
            '183333.1',
            '52777.64722222222222222222222222222222222',
            '11507.89523809523809523809523809523809524',
            '2033.72068452380952380952380952380952381',
            '303.1287808641975308641975308641975308642',
            # ...
          ];

Note that where I put in a C<# ...>, you would really see a couple
more lines of numbers until the numbers go below an expected
frequency of C<0.1>.
For C<n=1000000> and C<k=7>, you get about 39 sequences,
C<k=8> is expected to be found 4-5 times, etc.

Throws an exception if no data has been set or if the data
consists of fewer than two random numbers.

=cut

sub test {
    my $self = shift;
    my $data = $self->{data};

    if (not defined $data) {
        croak("Set data using 'set_data' first.");
    }

    # bin counters: $bins[$k] counts the sequences of length $k
    my @bins;
    # current sequence type (> or <), stored as the result of <=>
    my $current;
    # current sequence length
    my $length = 0;
    # total number of random numbers
    my $numbers = 0;
    # the random number preceding the one currently looked at
    my $previous;

    # Feeds a batch of random numbers into the sequence counters.
    # Both input modes go through this one routine so that they count
    # numbers and sequences identically.
    my $consume = sub {
        foreach my $value (@_) {
            $numbers++;
            if ($numbers > 1) {
                # Equal neighbours are treated like "greater" so that
                # every comparison yields a direction.
                my $cmp = ($previous <=> $value) || 1;
                if (not defined $current) {
                    # very first comparison opens the first sequence
                    $current = $cmp;
                    $length  = 1;
                }
                elsif ($current == $cmp) {
                    $length++;
                }
                else {
                    # direction changed: close the finished sequence
                    $bins[$length]++;
                    $current = $cmp;
                    $length  = 1;
                }
            }
            $previous = $value;
        }
        return;
    };

    if (_ARRAY($data)) {
        $consume->(@$data);
    }
    else { # CODE
        # Each call may return one number or a whole batch.
        $consume->( $data->() ) foreach 1 .. $self->{n};
    }

    # Without at least one comparison there is no sequence to count and
    # the metric below would divide by zero.
    if ($numbers < 2) {
        croak("The sequence test needs at least two random numbers.");
    }

    # close the sequence that was still open at the end of the data
    $bins[$length]++;

    # N in the formula is the number of comparisons, i.e. one less than
    # the number of random numbers.
    my $comparisons = $numbers - 1;

    my @expected = (0); # 0-bin is empty
    foreach my $bin (1..$#bins) {
        $expected[$bin] = expected_frequency($bin, $comparisons);
    }

    # Extend the expectation beyond the longest observed sequence until
    # it becomes negligible.
    my $last_bin = $#bins;
    while ($expected[$last_bin] > 0.1) {
        $last_bin++;
        $expected[$last_bin] = expected_frequency($last_bin, $comparisons);
    }

    # lengths that never occurred count as zero
    foreach my $bin (0..$last_bin) {
        $bins[$bin] = 0 if not defined $bins[$bin];
    }

    my @diff = map { abs($bins[$_] - $expected[$_]) } 0..$#bins;
    my $residual = 0;
    $residual += $_**2 for @diff;
    $residual = sqrt($residual);

    $residual = "$residual"; # de-bigfloatize
    @expected = map {"$_"} @expected; # de-bigfloatize

    return(
        $residual / $comparisons,
        \@bins,
        \@expected,
    );
}

=head1 SUBROUTINES

=head2 expected_frequency

Returns the expected frequency of the sequence length C<k>
in a set of C<n> random numbers assuming uncorrelated random numbers.

Returns this as a L<Math::BigFloat>.

Expects C<k> and C<n> as arguments.

This subroutine is memoized. (See L<Memoize>.)

=cut

memoize('expected_frequency');
sub expected_frequency {
    # Arbitrary precision is needed because (k+3)! grows quickly.
    my $k = Math::BigFloat->new(shift);
    my $n = Math::BigFloat->new(shift);
    return(
        2 * ( ($k**2 + 3*$k + 1)*$n - ($k**3 + 3*$k**2 - $k - 4) )
        / faculty($k+3)
    );
}

=head2 faculty

Computes the factorial of the first argument recursively as a
L<Math::BigFloat>.

This subroutine is memoized. (See L<Memoize>.)

=cut memoize('faculty'); sub faculty { my $n = shift; return Math::BigFloat->bone() if $n <= 1; return $n * faculty($n-1); } 1; __END__ =head1 SEE ALSO L, L, L Random number generators: L, L, L, L, C where available The algorithm was taken from: (German) Blobel, V., and Lohrmann, E. I. Stuttgart, Leipzig: Teubner, 1998 =head1 AUTHOR Steffen Mueller, Esmueller@cpan.orgE =head1 COPYRIGHT AND LICENSE Copyright (C) 2007 by Steffen Mueller This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.6 or, at your option, any later version of Perl 5 you may have available. =cut Statistics-Test-Sequence-0.01/Makefile.PL0000644000175000017500000000126710547464421016720 0ustar tseetseeuse 5.006; use ExtUtils::MakeMaker; # See lib/ExtUtils/MakeMaker.pm for details of how to influence # the contents of the Makefile that is written. WriteMakefile( NAME => 'Statistics::Test::Sequence', VERSION_FROM => 'lib/Statistics/Test/Sequence.pm', # finds $VERSION LICENSE => 'perl', PREREQ_PM => { Params::Util => '0', Math::BigFloat => '0', Memoize => '0', }, # e.g., Module::Name => 1.1 ($] >= 5.005 ? ## Add these new keywords supported since 5.005 (ABSTRACT_FROM => 'lib/Statistics/Test/Sequence.pm', # retrieve abstract from module AUTHOR => 'Steffen Mueller ') : ()), ); Statistics-Test-Sequence-0.01/README0000644000175000017500000001231210547464334015622 0ustar tseetseeNAME Statistics::Test::Sequence - Sequence correlation test for random numbers SYNOPSIS use Statistics::Test::Sequence; my $tester = Statistics::Test::Sequence->new(); $tester->set_data( [map {rand()} 1..1000000] ); my ($metric, $actual_freq, $expected_freq) = $tester->test(); use Data::Dumper; print "$metric\n"; print "Frequencies:\n"; print Dumper $actual_freq; print "Expected frequencies:\n"; print Dumper $expected_freq; DESCRIPTION This module implements a sequence correlation test for random number generators. 
It shows pairwise correlation between subsequent random numbers. The algorithm is as follows: (Following Blobel. Citation in SEE ALSO section.) * Given "N+1" random numbers "u_j". * For all "j", compare "u_j" with "u_j+1". If "u_j" is greater than "u_j+1", assign a 0-Bit to the number. Otherwise, assign a 1-Bit. * Find all sequences of equal Bits. For every sequence, increment a counter for the length "k" of that sequence. (Regardless of whether it's a sequence of 1's or 0's.) * For uncorrelated random numbers, the number of sequences N(k) of length "k" in the set of "N+1" random numbers is expected to be: N(k) = 2*((k^2+3*k+1)*N - (k^3+3*k^2-k-4)) / (k+3)! METHODS new Creates a new random number tester. set_data Sets the random numbers to operate on. First argument must be either an array reference to an array of random numbers or a code reference. If the first argument is a code reference, the second argument must be an integer "n". The code reference is called "n"-times and its return values are used as random numbers. The code reference semantics are particularly useful if you do not want to store all random numbers in memory at the same time. You can write a subroutine that, for example, generates and returns batches of 100 random numbers so no more than 101 of these numbers will be in memory at the same time. Note that if you return 100 numbers at once and pass in "n=50", you will have a sequence of 5000 random numbers. test Runs the sequence test on the data that was previously set using "set_data". Returns three items: The first is the root mean square of the bin residuals divided by the number of random numbers. It *could* be used as a measure for the quality of the random number generator and should be as close to zero as possible. A better metric is to compare the following two return values. The second return value is a reference to the array of frequencies. An example is in order here. 
Generating one million random numbers, I get: [0, 416765, 181078, 56318, 11486, 1056, 150] This means there were no sequences of length 0 (obvious), 416765 sequences of length 1, etc. There were no sequences of length 7 or greater. This example is a bad random number generator! (It's a linear congruential generator with "(a*x_i+c)%m" and "a=421", "c=64773", "m=259200", and "x_0=4711"). The third return value is similar in nature to the second in that it is a reference to an array containing sequence length frequencies. This one, however, contains the frequencies that would be expected for the given number of random numbers, were they uncorrelated. The number of bins has the maximum length of an occurring sequence as an upper limit. In the given example, you would get: (Dumped with Data::Dumper) $VAR1 = [ '0', '416666.75', '183333.1', '52777.64722222222222222222222222222222222', '11507.89523809523809523809523809523809524', '2033.72068452380952380952380952380952381', '303.1287808641975308641975308641975308642', # ... ]; Note that where I put in a "# ...", you would really see a couple more lines of numbers until the numbers go below an expected frequency of 0.1. For "n=1000000" and "k=7", you get about 39 sequences, "k=8" is expected to be found 4-5 times, etc. SUBROUTINES expected_frequency Returns the expected frequency of the sequence length "k" in a set of "n" random numbers assuming uncorrelated random numbers. Returns this as a Math::BigFloat. Expects "k" and "n" as arguments. This subroutine is memoized. (See Memoize.) faculty Computes the factorial of the first argument recursively as a Math::BigFloat. This subroutine is memoized. (See Memoize.) SEE ALSO Math::BigFloat, Memoize, Params::Util Random number generators: Math::Random::MT, Math::Random, Math::Random::OO, Math::TrulyRandom, "/dev/random" where available The algorithm was taken from: (German) Blobel, V., and Lohrmann, E. *Statistische und numerische Methoden der Datenanalyse*. 
Stuttgart, Leipzig: Teubner, 1998 AUTHOR Steffen Mueller, COPYRIGHT AND LICENSE Copyright (C) 2007 by Steffen Mueller This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.6 or, at your option, any later version of Perl 5 you may have available. Statistics-Test-Sequence-0.01/META.yml0000644000175000017500000000102510547464515016213 0ustar tseetsee--- #YAML:1.0 name: Statistics-Test-Sequence version: 0.01 abstract: Sequence correlation test for random numbers license: perl generated_by: ExtUtils::MakeMaker version 6.31 distribution_type: module requires: Math::BigFloat: 0 Memoize: 0 Params::Util: 0 meta-spec: url: http://module-build.sourceforge.net/META-spec-v1.2.html version: 1.2 author: - Steffen Mueller