==== Algorithm-NaiveBayes-0.04/Build.PL ====

use Module::Build;

my $build = Module::Build->new
  (
   module_name => 'Algorithm::NaiveBayes',
   license => 'perl',
   create_makefile_pl => 'traditional',
   requires => {
                'List::Util' => 0,
               },
  );

$build->create_build_script;

==== Algorithm-NaiveBayes-0.04/Changes ====

Revision history for Perl extension Algorithm::NaiveBayes.

0.04

 - Fixed a runtime error in the Gaussian model - the code died looking
   for a rescale() function, which I had forgotten to import. [Manju
   Putcha]

 - Added save_state() and restore_state() methods, which help
   consumers (like AI::Categorizer) avoid a pesky "can't find method"
   error when using a restored model.

0.03  Mon May 17 22:11:00 CDT 2004

 - The double loop inside the predict() method has been turned
   inside-out, so that the outer loop is now over the new attributes
   rather than over the different labels in the training set.

 - Some internal changes to make customization easier: moved the
   model-creation code into new internal classes, making it easier to
   write additional classes that handle model creation differently.

 - Moved some of the numerical utility functions into a new
   Algorithm::NaiveBayes::Util module.

 - We now use sum() and max() from List::Util rather than implementing
   our own.  The ones in List::Util are considerably faster.

 - Added a copyright and license statement. [Spotted by Alexey Tourbin]

0.02  Fri May 23 13:36:48 CDT 2003

 - The do_purge() method was called during training regardless of
   whether the 'purge' flag was set.  This has been fixed.

0.01  Tue Mar 18 18:45:34 2003

 - Original version; created by extracting the NaiveBayes code out of
   AI::Categorizer::Learner::NaiveBayes.

==== Algorithm-NaiveBayes-0.04/INSTALL ====

Installation instructions for Algorithm::NaiveBayes

To install this module, do this:

  perl Build.PL
  ./Build
  ./Build test
  ./Build install

Alternatively, if you don't want to use the newer Module::Build
module, you can follow the standard steps for installing most Perl
modules:

  perl Makefile.PL
  make
  make test
  make install

Or you may use the CPAN.pm module, which will automatically execute
these steps for you.  See 'perldoc CPAN' for the details.

 -Ken
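The save_state() and restore_state() methods mentioned in the Changes
entry for 0.04 persist a trained model with Storable (see
Algorithm::NaiveBayes further below).  A minimal sketch of the round
trip; the file path and the training data here are invented for
illustration:

  use Algorithm::NaiveBayes;

  # Train a small model, then write it to disk via Storable::nstore().
  my $nb = Algorithm::NaiveBayes->new;
  $nb->add_instance(attributes => {vampires => 1, garlic => 2}, label => 'horror');
  $nb->add_instance(attributes => {garlic => 3, butter => 1},   label => 'cooking');
  $nb->train;
  $nb->save_state('/tmp/nb-model.sto');    # illustrative path

  # Later, possibly in another process: restore_state() is a class
  # method that returns a ready-to-use predictor.
  my $restored = Algorithm::NaiveBayes->restore_state('/tmp/nb-model.sto');
  my $scores   = $restored->predict(attributes => {garlic => 1});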
==== Algorithm-NaiveBayes-0.04/lib/Algorithm/NaiveBayes/Model/Discrete.pm ====

package Algorithm::NaiveBayes::Model::Discrete;

use strict;
use base qw(Algorithm::NaiveBayes);
use Algorithm::NaiveBayes::Util qw(rescale);

sub do_add_instance {
  my ($self, $attributes, $labels, $data) = @_;
  foreach my $label ( @$labels ) {
    my $mylabel = $data->{labels}{$label} ||= {};
    $mylabel->{count}++;
    while (my ($attr, $value) = each %$attributes) {
      $mylabel->{attrs}{$attr}{$value}++;
    }
  }
}

sub do_train {
  my ($self, $training_data) = @_;
  my $m = {};

  my $instances = $self->instances;
  my $labels = $training_data->{labels};
  my $probs = $m->{probs} = {};

  # Calculate the log-probabilities for each category
  foreach my $label ($self->labels) {
    $m->{prior_probs}{$label} = log($labels->{$label}{count} / $instances);

    my $denominator = log($labels->{$label}{count});

    while (my ($attribute, $values) = each %{ $labels->{$label}{attrs} }) {
      while (my ($value, $count) = each %$values) {
        $probs->{$attribute}{$label}{$value} = log($count) - $denominator;
      }
    }
  }
  return $m;
}

sub do_predict {
  my ($self, $m, $newattrs) = @_;

  # Note that we're using the log(prob) here.  That's why we add instead of multiply.
  my %scores = %{$m->{prior_probs}};
  while (my ($feature, $value) = each %$newattrs) {
    next unless exists $m->{probs}{$feature};  # Ignore totally unseen features
    while (my ($label, $values) = each %{$m->{probs}{$feature}}) {
      $scores{$label} += ($values->{$value} || 0);
    }
  }

  rescale \%scores;
  return \%scores;
}

1;
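The Discrete model above counts each (attribute, value) pair per
label, so attribute values are symbols rather than numeric weights.
It is selected through the model_type parameter handled by the base
Algorithm::NaiveBayes class (further below).  A minimal usage sketch;
the weather-style attributes and labels are made up for illustration:

  use Algorithm::NaiveBayes;

  my $nb = Algorithm::NaiveBayes->new(model_type => 'Discrete');
  $nb->add_instance(attributes => {outlook => 'sunny', wind => 'weak'},   label => 'play');
  $nb->add_instance(attributes => {outlook => 'rain',  wind => 'strong'}, label => 'stay_home');
  $nb->train;

  # One rescaled score per label; higher means more likely.
  my $scores = $nb->predict(attributes => {outlook => 'sunny', wind => 'strong'});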
==== Algorithm-NaiveBayes-0.04/lib/Algorithm/NaiveBayes/Model/Frequency.pm ====

package Algorithm::NaiveBayes::Model::Frequency;

use strict;
use Algorithm::NaiveBayes::Util qw(sum_hash add_hash max rescale);
use base qw(Algorithm::NaiveBayes);

sub new {
  my $self = shift()->SUPER::new(@_);
  $self->training_data->{attributes} = {};
  $self->training_data->{labels} = {};
  return $self;
}

sub do_add_instance {
  my ($self, $attributes, $labels, $training_data) = @_;
  add_hash($training_data->{attributes}, $attributes);

  my $mylabels = $training_data->{labels};
  foreach my $label ( @$labels ) {
    $mylabels->{$label}{count}++;
    add_hash($mylabels->{$label}{attributes} ||= {}, $attributes);
  }
}

sub do_train {
  my ($self, $training_data) = @_;
  my $m = {};

  my $instances = $self->instances;
  my $labels = $training_data->{labels};
  $m->{attributes} = $training_data->{attributes};
  my $vocab_size = keys %{ $m->{attributes} };

  # Calculate the log-probabilities for each category
  foreach my $label ($self->labels) {
    $m->{prior_probs}{$label} = log($labels->{$label}{count} / $instances);

    # Count the number of tokens in this cat
    my $label_tokens = sum_hash($labels->{$label}{attributes});

    # Compute a smoothing term so P(word|cat)==0 can be avoided
    $m->{smoother}{$label} = -log($label_tokens + $vocab_size);

    # P(attr|label) = $count/$label_tokens                            (simple)
    # P(attr|label) = ($count + 1)/($label_tokens + $vocab_size)      (with smoothing)
    # log P(attr|label) = log($count + 1) - log($label_tokens + $vocab_size)
    my $denominator = log($label_tokens + $vocab_size);

    while (my ($attribute, $count) = each %{ $labels->{$label}{attributes} }) {
      $m->{probs}{$label}{$attribute} = log($count + 1) - $denominator;
    }
  }
  return $m;
}

sub do_predict {
  my ($self, $m, $newattrs) = @_;

  # Note that we're using the log(prob) here.  That's why we add instead of multiply.
  my %scores = %{$m->{prior_probs}};
  while (my ($feature, $value) = each %$newattrs) {
    next unless exists $m->{attributes}{$feature};  # Ignore totally unseen features
    while (my ($label, $attributes) = each %{$m->{probs}}) {
      $scores{$label} += ($attributes->{$feature} || $m->{smoother}{$label})*$value;  # P($feature|$label)**$value
    }
  }

  rescale(\%scores);
  return \%scores;
}

1;
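To make the add-one ("Laplace") smoothing arithmetic in do_train()
above concrete, here is a small standalone sketch applying the same
formula to made-up counts; the numbers are invented purely for
illustration:

  use strict;

  # Toy data: the label 'sports' has seen these token counts, and the
  # whole training set contains 6 distinct attributes (the vocabulary).
  my %counts       = (ball => 3, goal => 1);
  my $label_tokens = 3 + 1;
  my $vocab_size   = 6;

  # log P(attr|label) = log($count + 1) - log($label_tokens + $vocab_size)
  my $denominator = log($label_tokens + $vocab_size);
  foreach my $attr (sort keys %counts) {
    printf "log P(%s|sports) = %.4f\n", $attr, log($counts{$attr} + 1) - $denominator;
  }

  # A token never seen with this label falls back to the smoother,
  # -log($label_tokens + $vocab_size), i.e. a count of zero plus one,
  # so no attribute ever contributes probability zero.
  printf "smoother          = %.4f\n", -log($label_tokens + $vocab_size);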
==== Algorithm-NaiveBayes-0.04/lib/Algorithm/NaiveBayes/Model/Gaussian.pm ====

package Algorithm::NaiveBayes::Model::Gaussian;

use strict;
use base qw(Algorithm::NaiveBayes);
use Algorithm::NaiveBayes::Util qw(sum variance rescale);
use constant Pi => 4*atan2(1, 1);

sub do_add_instance {
  my ($self, $attributes, $labels, $training_data) = @_;
  foreach my $label ( @$labels ) {
    my $mylabel = $training_data->{labels}{$label} ||= {};
    $mylabel->{count}++;
    while (my ($attr, $value) = each %$attributes) {
      push @{$mylabel->{attrs}{$attr}}, $value;
    }
  }
}

sub do_train {
  my ($self, $training_data) = @_;
  my $m = {};

  my $instances = $self->instances;
  my $labels = $training_data->{labels};

  while (my ($label, $data) = each %$labels) {
    $m->{prior_probs}{$label} = log($labels->{$label}{count} / $instances);

    # Calculate the mean & stddev for each label-attribute combination
    while (my ($attr, $values) = each %{$data->{attrs}}) {
      my $mean = sum(@$values) / @$values;  # List::Util::sum() takes a list, not an array reference
      my $var = variance($values, $mean) or next;  # Can't use a variance of zero
      @{ $m->{summary}{$attr}{$label} }{'mean', 'var'} = ($mean, $var);
    }
  }
  return $m;
}

sub do_predict {
  my ($self, $m, $newattrs) = @_;

  my %scores = %{$m->{prior_probs}};
  while (my ($feature, $value) = each %$newattrs) {
    next unless exists $m->{summary}{$feature};  # Ignore totally unseen features
    while (my ($label, $data) = each %{$m->{summary}{$feature}}) {
      my ($mean, $var) = @{$data}{'mean', 'var'};

      # This is simplified from
      #  += log( 1/sqrt($var*2*Pi) * exp(-($value-$mean)**2/(2*$var)) );
      $scores{$label} -= 0.5*(log($var) + log(2*Pi) + ($value-$mean)**2/$var);
    }
  }

  rescale(\%scores);
  return \%scores;
}

1;
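The Gaussian model above fits a mean and variance per label-attribute
pair, so it suits real-valued features.  A minimal usage sketch with
invented measurements (each attribute needs at least two distinct
training values per label, since a zero variance is skipped):

  use Algorithm::NaiveBayes;

  my $nb = Algorithm::NaiveBayes->new(model_type => 'Gaussian');
  $nb->add_instance(attributes => {height => 180, weight => 80}, label => 'adult');
  $nb->add_instance(attributes => {height => 170, weight => 72}, label => 'adult');
  $nb->add_instance(attributes => {height => 120, weight => 30}, label => 'child');
  $nb->add_instance(attributes => {height => 130, weight => 35}, label => 'child');
  $nb->train;

  # 'adult' should dominate here, since 175/78 is close to that
  # label's fitted means and far from those of 'child'.
  my $scores = $nb->predict(attributes => {height => 175, weight => 78});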
==== Algorithm-NaiveBayes-0.04/lib/Algorithm/NaiveBayes/Util.pm ====

package Algorithm::NaiveBayes::Util;

use strict;
use base qw(Exporter);
use vars qw(@EXPORT_OK);
@EXPORT_OK = qw(sum sum_hash max variance add_hash rescale);

use List::Util qw(max sum);

sub sum_hash {
  my $href = shift;
  return sum(values %$href);
}

sub variance {
  my $array = shift;
  return 0 unless @$array > 1;

  my $mean = @_ ? shift : sum(@$array) / @$array;  # List::Util::sum() takes a list, not an array reference

  my $var = 0;
  $var += ($_ - $mean)**2 foreach @$array;
  return $var / (@$array - 1);
}

sub add_hash {
  my ($first, $second) = @_;
  foreach my $k (keys %$second) {
    $first->{$k} += $second->{$k};
  }
}

sub rescale {
  my ($scores) = @_;

  # Scale everything back to a reasonable area in logspace (near zero),
  # un-loggify, and normalize
  my $total = 0;
  my $max = max(values %$scores);
  foreach (values %$scores) {
    $_ = exp($_ - $max);
    $total += $_**2;
  }
  $total = sqrt($total);
  foreach (values %$scores) {
    $_ /= $total;
  }
}

1;

==== Algorithm-NaiveBayes-0.04/lib/Algorithm/NaiveBayes.pm ====

package Algorithm::NaiveBayes;

use strict;
use Storable;
use vars qw($VERSION);
$VERSION = '0.04';

sub new {
  my $package = shift;
  my $self = bless {
                    version => $VERSION,
                    purge => 1,
                    model_type => 'Frequency',
                    @_,
                    instances => 0,
                    training_data => {},
                   }, $package;

  if ($package eq __PACKAGE__) {
    # Bless into the proper subclass
    return $self->_load_model_class->new(@_);
  }

  return $self;
}

sub _load_model_class {
  my $self = shift;
  die "model_class cannot be set to " . __PACKAGE__
    if ($self->{model_class}||'') eq __PACKAGE__;
  my $package = $self->{model_class} || __PACKAGE__ . "::Model::" . $self->{model_type};
  unless ($package->can('new')) {
    eval "use $package";
    die $@ if $@;
  }
  return $package;
}

sub save_state {
  my ($self, $path) = @_;
  Storable::nstore($self, $path);
}

sub restore_state {
  my ($pkg, $path) = @_;
  my $self = Storable::retrieve($path)
    or die "Can't restore state from $path: $!";
  $self->_load_model_class;
  return $self;
}

sub add_instance {
  my ($self, %params) = @_;
  for ('attributes', 'label') {
    die "Missing required '$_' parameter" unless exists $params{$_};
  }
  for ($params{label}) {
    $_ = [$_] unless ref;
    @{$self->{labels}}{@$_} = ();
  }

  $self->{instances}++;
  $self->do_add_instance($params{attributes}, $params{label}, $self->{training_data});
}

sub labels { keys %{ $_[0]->{labels} } }

sub instances { $_[0]->{instances} }

sub training_data { $_[0]->{training_data} }

sub train {
  my $self = shift;
  $self->{model} = $self->do_train($self->{training_data});
  $self->do_purge if $self->purge;
}

sub do_purge {
  my $self = shift;
  delete $self->{training_data};
}

sub purge {
  my $self = shift;
  $self->{purge} = shift if @_;
  return $self->{purge};
}

sub predict {
  my ($self, %params) = @_;
  my $newattrs = $params{attributes} or die "Missing 'attributes' parameter for predict()";
  return $self->do_predict($self->{model}, $newattrs);
}

1;
__END__

=head1 NAME

Algorithm::NaiveBayes - Bayesian prediction of categories

=head1 SYNOPSIS

  use Algorithm::NaiveBayes;
  my $nb = Algorithm::NaiveBayes->new;

  $nb->add_instance
    (attributes => {foo => 1, bar => 1, baz => 3},
     label => 'sports');

  $nb->add_instance
    (attributes => {foo => 2, blurp => 1},
     label => ['sports', 'finance']);

  ... repeat for several more instances, then:
  $nb->train;

  # Find results for unseen instances
  my $result = $nb->predict
    (attributes => {bar => 3, blurp => 2});

=head1 DESCRIPTION

This module implements the classic "Naive Bayes" machine learning
algorithm.  It is a well-studied probabilistic algorithm often used in
automatic text categorization.  Compared to other algorithms (kNN,
SVM, Decision Trees), it's pretty fast and reasonably competitive in
the quality of its results.

A paper by Fabrizio Sebastiani, "Machine Learning in Automated Text
Categorization" (ACM Computing Surveys, 2002), provides a really good
introduction to text categorization.

=head1 METHODS

=over 4

=item new()

Creates a new C<Algorithm::NaiveBayes> object and returns it.  The
following parameters are accepted:

=over 4

=item purge

If set to a true value, the C<do_purge()> method will be invoked
during C<train()>.  The default is true.  Set this to a false value if
you'd like to be able to add additional instances after training and
then call C<train()> again.

=back

=item add_instance( attributes =E<gt> HASH, label =E<gt> STRING|ARRAY )

Adds a training instance to the categorizer.  The C<attributes>
parameter contains a hash reference whose keys are string attributes
and whose values are the weights of those attributes.  For instance,
if you're categorizing text documents, the attributes might be the
words of the document, and the weights might be the number of times
each word occurs in the document.

The C<label>