Bloom-Filter-1.0/0000755000076500007650000000000010563443334014366 5ustar maciejmaciej00000000000000Bloom-Filter-1.0/Changes0000644000076500007650000000022210077302045015646 0ustar maciejmaciej00000000000000Revision history for Perl extension Filter. 0.01 Fri Mar 26 10:45:51 2004 - original version; created by h2xs 1.21 with options -XPA Filter Bloom-Filter-1.0/Filter.pm0000644000076500007650000001466110563443303016155 0ustar maciejmaciej00000000000000package Bloom::Filter; use strict; use warnings; use Carp; use Digest::SHA1 qw/sha1 sha1_base64/; our $VERSION = '1.0'; =head1 NAME Bloom::Filter - Sample Perl Bloom filter implementation =head1 DESCRIPTION A Bloom filter is a probabilistic algorithm for doing existence tests in less memory than a full list of keys would require. The tradeoff to using Bloom filters is a certain configurable risk of false positives. This module implements a simple Bloom filter with configurable capacity and false positive rate. Bloom filters were first described in a 1970 paper by Burton Bloom, see L. =head1 SYNOPSIS use Bloom::Filter my $bf = Bloom::Filter->new( capacity => 10, error_rate => .001 ); $bf->add( @keys ); while ( <> ) { chomp; print "Found $_\n" if $bf->check( $_ ); } =head1 CONSTRUCTORS =over =item new %PARAMS Create a brand new instance. Allowable params are C, C. =cut sub new { my ( $class, %params ) = @_; my $self = { # some defaults error_rate => 0.001, capacity => 100, %params, # internal data key_count => 0, filter_length => 0, num_hash_funcs => 0, salts => [], }; bless $self, $class; $self->init(); return $self; } =item init Calculates the best number of hash functions and optimum filter length, creates some random salts, and generates a blank bit vector. Called automatically by constructor. =cut sub init { my ( $self ) = @_; # some sanity checks croak "Capacity must be greater than zero" unless $self->{capacity}; croak "Error rate must be greater than zero" unless $self->{error_rate}; croak "Error rate cannot exceed 1" unless $self->{error_rate} < 1; my ( $length, $num_funcs ) = $self->_calculate_shortest_filter_length ($self->{capacity}, $self->{error_rate} ); $self->{num_hash_funcs} = $num_funcs; $self->{filter_length} = $length; # create some random salts; my %collisions; while ( scalar keys %collisions < $self->{num_hash_funcs} ) { $collisions{rand()}++; } $self->{salts} = [ keys %collisions ]; # make an empty filter $self->{filter} = pack( "b*", '0' x $self->{filter_length} ); # make some blank vectors to use $self->{blankvec} = pack( "N", 0 ); return 1; } =back =head1 ACCESSORS =over =item capacity Returns the total capacity of the Bloom filter =cut sub capacity { $_[0]->{capacity} }; =item error_rate Returns the configured maximum error rate =cut sub error_rate { $_[0]->{error_rate} }; =item length Returns the length of the Bloom filter in bits =cut sub length { $_[0]->{filter_length} }; =item key_count Returns the number of items currently stored in the filter =cut sub key_count { $_[0]->{key_count} }; =item on_bits Returns the number of 'on' bits in the filter =cut sub on_bits { my ( $self ) = @_; return unless $self->{filter}; return unpack( "%32b*", $self->{filter}) } =item salts Returns the list of salts used to create the hash functions =cut sub salts { my ( $self ) = @_; return unless exists $self->{salts} and ref $self->{salts} and ref $self->{salts} eq 'ARRAY'; return @{ $self->{salts} }; } =back =head1 PUBLIC METHODS =over =item add @KEYS Adds the list of keys to the filter. Will fail, return C and complain if the number of keys in the filter exceeds the configured capacity. =cut sub add { my ( $self, @keys ) = @_; return unless @keys; # Hash our list of keys into the empty filter my @salts = @{ $self->{salts} } or croak "No salts found, cannot make bitmask"; foreach my $key ( @keys ) { if ($self->{key_count} >= $self->{capacity}) { carp "Exceeded filter capacity"; return; } # flip the appropriate bits on vec($self->{filter}, $_, 1) = 1 foreach @{$self->_get_cells($key)}; $self->{key_count}++; } return 1; } =item check @KEYS Checks the provided key list against the Bloom filter, and returns a list of equivalent length, with true or false values depending on whether there was a match. =cut sub check { my ( $self, @keys ) = @_; return unless @keys; my @result; # A match occurs if every bit we check is on foreach my $key ( @keys ) { my $match = 1; foreach my $cell (@{$self->_get_cells($key)} ) { $match = vec( $self->{filter}, $cell, 1 ) ; last unless $match; } push @result, $match; } return ( wantarray() ? @result : $result[0] ); } =back =head1 INTERNAL METHODS =over =item _calculate_shortest_filter_length CAPACITY ERR_RATE Given a desired error rate and maximum capacity, returns the optimum combination of vector length (in bits) and number of hash functions to use in building the filter, where "optimum" means shortest vector length. =cut sub _calculate_shortest_filter_length { my ( $self, $num_keys, $error_rate ) = @_; my $lowest_m; my $best_k = 1; foreach my $k ( 1..100 ) { my $m = (-1 * $k * $num_keys) / ( log( 1 - ($error_rate ** (1/$k)))); if ( !defined $lowest_m or ($m < $lowest_m) ) { $lowest_m = $m; $best_k = $k; } } $lowest_m = int( $lowest_m ) + 1; return ( $lowest_m, $best_k ); } =item _get_cells KEY Given a key, hashes it using the list of salts and returns an array of cell indexes corresponding to the key. =cut sub _get_cells { my ( $self, $key ) = @_; croak "Filter length is undefined" unless $self->{filter_length}; my @salts = @{ $self->{salts} } or croak "No salts found, cannot make bitmask"; my @cells; foreach my $salt ( @salts ){ my $hash = sha1( $key, $salt ); # blank 32 bit vector my $vec = $self->{blankvec}; # split the 160-bit hash into five 32-bit ints # and XOR the pieces together my @pieces = map {pack( "N", $_ )} unpack("N*", $hash ); $vec = $_ ^ $vec foreach @pieces; # Calculate bit offset by modding my $result = unpack( "N", $vec ); my $bit_offset = $result % $self->{filter_length}; push @cells, $bit_offset; } return \@cells; } =back =head1 AUTHOR Maciej Ceglowski Emaciej@ceglowski.comE =head1 CHANGELOG Feb 2007 big speedup by Dmitriy Ryaboy Edmitriy.ryaboy@ask.comE (thanks!) =head1 COPYRIGHT AND LICENSE (c) 2004 Maciej Ceglowski This is free software, distributed under version 2 of the GNU Public License (GPL). =cut 1; Bloom-Filter-1.0/Makefile.PL0000644000076500007650000000077110562223352016340 0ustar maciejmaciej00000000000000use ExtUtils::MakeMaker; # See lib/ExtUtils/MakeMaker.pm for details of how to influence # the contents of the Makefile that is written. WriteMakefile( 'NAME' => 'Bloom::Filter', 'VERSION_FROM' => 'Filter.pm', # finds $VERSION 'PREREQ_PM' => {}, # e.g., Module::Name => 1.1 ($] >= 5.005 ? ## Add these new keywords supported since 5.005 (ABSTRACT_FROM => 'Filter.pm', # retrieve abstract from module AUTHOR => 'A. U. Thor ') : ()), ); Bloom-Filter-1.0/MANIFEST0000644000076500007650000000023410562223407015512 0ustar maciejmaciej00000000000000Changes Filter.pm Makefile.PL MANIFEST README t/add.t t/check.t t/defaults.t META.yml Module meta-data (added by MakeMaker) Bloom-Filter-1.0/META.yml0000644000076500007650000000045110563443334015637 0ustar maciejmaciej00000000000000# http://module-build.sourceforge.net/META-spec.html #XXXXXXX This is a prototype!!! It will change in the future!!! XXXXX# name: Bloom-Filter version: 1.0 version_from: Filter.pm installdirs: site requires: distribution_type: module generated_by: ExtUtils::MakeMaker version 6.17 Bloom-Filter-1.0/README0000644000076500007650000000170710077302045015244 0ustar maciejmaciej00000000000000Filter version 0.01 =================== The README is used to introduce the module and provide instructions on how to install the module, any machine dependencies it may have (for example C compilers and installed libraries) and any other information that should be provided before the module is installed. A README file is required for CPAN modules since CPAN extracts the README file from a module distribution so that people browsing the archive can use it get an idea of the modules uses. It is usually a good idea to provide version information here so that people can decide whether fixes for the module are worth downloading. INSTALLATION To install this module type the following: perl Makefile.PL make make test make install DEPENDENCIES This module requires these other modules and libraries: blah blah blah COPYRIGHT AND LICENCE Put the correct copyright and licence information here. Copyright (C) 2004 A. U. Thor blah blah blah Bloom-Filter-1.0/t/0000755000076500007650000000000010563443334014631 5ustar maciejmaciej00000000000000Bloom-Filter-1.0/t/add.t0000644000076500007650000000126110077302045015537 0ustar maciejmaciej00000000000000#!/usr/bin/perl -w use Test::More qw/no_plan/; use Bloom::Filter; my $bf = Bloom::Filter->new(); # default capacity should be 100; my @salts = $bf->salts(); is( scalar @salts, 10, "Correct default number of salts" ); my @keys = qw/Hansel Gretel/; is( $bf->key_count(), 0, "No keys" ); ok( $bf->add( $keys[0] ), "Added key" ); is( $bf->key_count(), 1, "Key count incremented" ); ok( $bf->add( $keys[0] ), "Added key" ); is( $bf->key_count(), 2, "Key count incremented" ); for (1..98) { $bf->add( $_ ) }; ok( !$bf->add( "last key" ), "Capacity exceeded" ); $bf = Bloom::Filter->new(); ok( $bf->add( @keys ), "Added multiple keys" ); is( $bf->key_count(), 2, "Correct key count" );Bloom-Filter-1.0/t/check.t0000644000076500007650000000073510077302045016071 0ustar maciejmaciej00000000000000#!/usr/bin/perl -w use Test::More qw/no_plan/; use Bloom::Filter; my $bf = Bloom::Filter->new(); # default capacity should be 100; my @salts = $bf->salts(); is( scalar @salts, 10, "Correct default number of salts" ); my @keys = qw/Hansel Gretel/; ok( $bf->add( @keys ), "Added key" ); ok( $bf->check( "Hansel" ), "Found key 'Hansel' in filter" ); ok( !$bf->check( "Herman" ), "Did not find key 'Herman'" ); ok( $bf->check( "Gretel" ), "Found key 'Gretel' in filter" ); Bloom-Filter-1.0/t/defaults.t0000644000076500007650000000104310077302045016614 0ustar maciejmaciej00000000000000#!/usr/bin/perl -w use Test::More qw/no_plan/; use Bloom::Filter; my $bf = Bloom::Filter->new(); # default capacity should be 100; is( $bf->capacity(), 100, "Default capacity ok" ); is( $bf->error_rate(), 0.001, "Default error rate ok" ); is( $bf->length(), 1438, "Length calculated properly" ); my $bf2 = Bloom::Filter->new( capacity => 1092, error_rate => .00001); is( $bf2->capacity(), 1092, "Custom capacity ok" ); is( $bf2->error_rate(), 0.00001, "Custom error rate ok" ); is( $bf2->length(), 26172, "Length calculated properly" );