News-Scan-0.53/0040755006374700003100000000000007621721716012712 5ustar gbaconPremiumNews-Scan-0.53/README0100644006374700003100000000406507565154375013604 0ustar gbaconPremium Description ----------- This is the Perl 5 News-Scan distribution. It requires perl version 5.004 or later. This distribution provides a mechanism for collecting articles from a set of Usenet newsgroups and then creating statistical reports about those groups. IMPORTANT!!! IMPORTANT!!! IMPORTANT!!! IMPORTANT!!! IMPORTANT!!! IMPORTANT!!! IMPORTANT!!! IMPORTANT!!! IMPORTANT!!! IMPORTANT!!! This release of News-Scan is incompatible with prior releases. This should be considered alpha-quality software, so the internal implementation is subject to change without notice, although I will only make incompatible changes only when absolutely necessary. Prerequisites ------------- To use NewsScan, you need to have installed the following packages: TimeDate-#.## MailTools#.## IO-#.## If you want to collect articles from an NNTP server, you'll also need to install libnet-#.## Installation ------------ Install NewsScan by running the following commands: perl Makefile.PL make make install Examples -------- Please see the eg/ directory after you have unpacked the distribution for examples of using this library to gather and report statistics for Usenet newsgroups. Bugs, Problems, Suggestions --------------------------- Please send any suggestions, bug reports, complaints, et cetera to Greg Bacon . Acknowledgements ---------------- This software is based on Mike Lee's posters distribution and also on software posted to comp.lang.perl.misc by Christian Murphy . Many, many thanks are also in order to Gary Niemcewicz for all his helpful thoughts, comments, and bug reports. Chaim Frenkel has also contributed immensely with his extremely helpful comments, suggestions, and questions. Thanks to Henrik Gemal, Gerrit P. Haase, and Oyvind Gronnesby for helpful suggestions and bug reports. Copyright --------- Copyright (c) 1997-2002 Greg Bacon. This library is free software; you can distribute and/or modify it under the same terms as Perl itself. News-Scan-0.53/eg/0040755006374700003100000000000007621721716013305 5ustar gbaconPremiumNews-Scan-0.53/eg/news-scan0100755006374700003100000000170007621721327015122 0ustar gbaconPremium#! /usr/local/bin/perl -w use strict; use Data::Dumper 'DumperX'; use News::Scan; my %aliases = ( 'gbacon@hiwaay.net' => 'gbacon@cs.uah.edu', ); my @exclude = ( 'perlfaq-suggestions\@(?:.*\.)?perl\.com', ); my $HOME = $ENV{HOME} || '/'; my $spool = "$HOME/spool/comp.lang.perl.misc"; my $scan = new News::Scan Group => 'comp.lang.perl.misc', Spool => $spool, Period => 7, Aliases => \%aliases, Exclude => \@exclude, From => 'spool'; unless (defined $scan) { die "$0: Failed to create News::Scan object!\n"; } elsif ($scan->error) { die "$0: Error: " . $scan->error . "\n"; } $scan->scan; if ($scan->error) { die "$0: Error: " . $scan->error . "\n"; } $Data::Dumper::Indent = 0; ## minimal whitespace $Data::Dumper::Purity = 1; ## get correct topology at any cost print DumperX $scan; exit; News-Scan-0.53/eg/avg-addy0100755006374700003100000000316707565154433014736 0ustar gbaconPremium#! /usr/bin/perl -w # avg-addy - compute the average of a list of email addresses # Copyright (c) 1998. Greg Bacon. All Rights Reserved. # This program is free software. You may distribute it or modify it # (perhaps both) under the terms of the Artistic License which comes # with the Perl Kit. use strict; use integer; my @name; my $namelen = 0; my @hname; my @hnamelens; my $total = 0; my $total_parts = 0; while (<>) { chomp; next unless /\S/; ## blank lines suck next unless /.@./; my($i,$j); $total++; s/^(.*?)@//; my $name = $1; $namelen += length $name; my @parts = split /\./, $_; $total_parts += @parts; $i = 0; foreach my $ch (split //, $name) { $name[$i] ||= 0; $name[$i] += ord($ch); $i++; } $i = 0; foreach my $part (@parts) { $hnamelens[$i] ||= 0; $hnamelens[$i] += length $part; $j = 0; foreach my $ch (split //, $part) { $hname[$i][$j] ||= 0; $hname[$i][$j] += ord($ch); $j++; } $i++; } } my $avg = ''; ## cull what we don't need my $avg_name_len = $namelen / $total; splice @name, $avg_name_len; my $avg_num_parts = $total_parts / $total; splice @hname, $avg_num_parts; splice @hnamelens, $avg_num_parts; foreach my $n (@name) { $avg .= chr($n / $total); } $avg .= '@'; for (my $i = 0; $i < @hname; $i++) { my $avg_len = $hnamelens[$i] / $total; splice @{$hname[$i]}, $avg_len; foreach my $n (@{$hname[$i]}) { $avg .= chr($n / $total); } $avg .= '.'; } $avg =~ s/\.$//; print "Average address: $avg\n"; News-Scan-0.53/eg/new-posters0100755006374700003100000002517007075371721015523 0ustar gbaconPremium#! /usr/local/bin/perl -w use strict; use POSIX; use News::Scan; my $Posters; my $Articles = 0; my $Volume = 0; my $HVol = 0; my $HLns = 0; my $BVol = 0; my $BLns = 0; my $OVol = 0; my $OLns = 0; my $SVol = 0; my $SLns = 0; ## subs sub in_kb { my $val = shift; sprintf "%.1f", ($val / 1024); } sub commify { local $_ = shift; 1 while s/^(-?\d+)(\d\d\d)/$1,$2/; $_; } sub places { my $acc = shift; my $val = shift; sprintf "%.${acc}f", $val } sub median { my @values = sort { $a <=> $b } @_; my $n = @values; if ($n % 2 == 1) { return $values[$n / 2]; } else { return places 1, ($values[$n / 2] + $values[$n/2 - 1]) / 2; } } sub mode { my %scores; local $_; for (@_) { $scores{$_}++; } my @scores = sort { $scores{$b} <=> $scores{$a} } keys %scores; my $high = $scores[0]; my $freq = $scores{$high}; my $i = 0; for (@scores) { if ($scores{$_} != $freq) { splice @scores, $i; last; } $i++; } if (@scores == 1) { return ($high, $freq); } elsif (@scores == 2) { return (join(" and ", @scores), $freq); } else { my $last = pop @scores; my $ret; $ret = join ", ", @scores; $ret .= ", and $last"; return ($ret, $freq); } } sub stdev { my @values = @_; my $avg = shift; my $n = @values; my $sum = 0; local $_; for (@values) { $sum += ($_ - $avg) ** 2; } places(1, sqrt($sum / $n)); } sub print_header { my $scan = shift; my $group = $scan->name; my $period = $scan->period; my $quote_re = $scan->quote_re; my $earliest = strftime "%d %b %Y %H:%M:%S", gmtime($scan->earliest); my $latest = strftime "%d %b %Y %H:%M:%S", gmtime($scan->latest); print < Newsgroups: $group Subject: New posters to $group Following is a summary of articles from new posters spanning a $period day period, beginning at $earliest GMT and ending at $latest GMT. Notes ===== - A line in the body of a post is considered to be original if it does *not* match the regular expression /$quote_re/. - All text after the last cut line (/^-- \$/) in the body is considered to be the author's signature. - The scanner prefers the Reply-To: header over the From: header in determining the "real" e-mail address and name. - Original Content Rating (OCR) is the ratio of the original content volume to the total body volume. - Find the News-Scan distribution on the CPAN! - Please send all comments to Greg Bacon . - Copyright (c) 1998 Greg Bacon. All Rights Reserved. Verbatim copying and redistribution is permitted without royalty; alteration is not permitted. Redistribution and/or use for any commercial purpose is prohibited. EOF } sub totals { my $scan = shift; my $old = $scan->posters; my $posters = $Posters; my $num_posters = scalar keys %$posters; my $ppct = places 1, ($num_posters / scalar(keys(%$old)) * 100); my $num_articles = $Articles; my $apct = places 1, ($Articles / $scan->articles * 100); my $total_volume = in_kb $Volume; my $vpct = places 1, ($Volume / $scan->volume * 100); my $hdr_volume = in_kb $HVol; my $hdr_lines = commify $HLns; my $body_volume = in_kb $BVol; my $body_lines = commify $BLns; my $orig_volume = in_kb $OVol; my $orig_lines = commify $OLns; my $sig_volume = in_kb $SVol; my $sig_lines = commify $SLns; my $ocr = sprintf "%.3f", ($OVol / $BVol); print <articles } values %$posters; my $pmed = median @posts_by_poster; my($pmode, $pmode_score) = mode @posts_by_poster; my $psd = stdev @posts_by_poster, ($scan->articles / $num_posters); my $num_articles = $Articles; my $msg = places 1, ($Volume / $num_articles); my $hdr = places 1, ($HVol / $num_articles); my $hdr_lines = places 1, ($HLns / $num_articles); my $body = places 1, ($BVol / $num_articles); my $body_lines = places 1, ($BLns / $num_articles); my $orig = places 1, ($OVol / $num_articles); my $orig_lines = places 1, ($OLns / $num_articles); my $sig = places 1, ($SVol / $num_articles); my $sig_lines = places 1, ($SLns / $num_articles); print <[0] } sort { $b->[1] <=> $a->[1] } map { [ $_, $_->articles ] } values %$posters )[0 .. 9]; $top_total = 0; for (@top) { last unless defined $_; my $vol = sprintf "%5.1f (%5.1f/%5.1f/%5.1f)", $_->volume / 1024, $_->header_volume / 1024, $_->body_volume / 1024, $_->orig_volume / 1024; printf "%5d %26s %s\n", $_->articles, $vol, $_->attrib; $top_total += $_->articles; } printf "\nThese posters accounted for %.1f%% of all articles.\n", 100 * $top_total / $scan->articles; ## by volume print <[0] } sort { $b->[1] <=> $a->[1] } map { [ $_, $_->volume ] } values %$posters )[0 .. 9]; $top_total = 0; for (@top) { last unless defined $_; my $vol = sprintf "%5.1f (%5.1f/%5.1f/%5.1f)", $_->volume / 1024, $_->header_volume / 1024, $_->body_volume / 1024, $_->orig_volume / 1024; printf "%26s %5d %s\n", $vol, $_->articles, $_->attrib; $top_total += $_->volume; } printf "\nThese posters accounted for %.1f%% of the total volume.\n", 100 * $top_total / $scan->volume; ## top OCR print <[1] <=> $a->[1] } map { [ $_, ($_->orig_volume / $_->body_volume) ] } grep { $_->articles >= 3 } values %$posters )[0 .. 9]; for (@top) { last unless defined $_; printf "%.3f (%5.1f /%5.1f) %5d %s\n", $_->[1], $_->[0]->orig_volume / 1024, $_->[0]->body_volume / 1024, $_->[0]->articles, $_->[0]->attrib; } ## bottom OCR print <[1] <=> $b->[1] } map { [ $_, ($_->orig_volume / $_->body_volume) ] } grep { $_->articles >= 3 } values %$posters )[0 .. 9]; for (reverse @top) { next unless defined $_; printf "%.3f (%5.1f /%5.1f) %5d %s\n", $_->[1], $_->[0]->orig_volume / 1024, $_->[0]->body_volume / 1024, $_->[0]->articles, $_->[0]->attrib; } my $total = scalar keys %$posters; my $eligible = scalar grep { $_->articles >= 3 } values %$posters; my $pct = sprintf "%d", 100 * $eligible / $total; my $str = $eligible == 1 ? "One poster ($pct%)" : "$eligible posters ($pct%)"; print "\n$str had at least three posts.\n\n"; } sub top_xposts { my $scan = shift; my @top; my $posters = $Posters; local $_; print <[1] <=> $a->[1] } map { [ $_, $_->crossposts ] } values %$posters )[0 .. 9]; for (@top) { next unless defined $_; printf "%8d %s\n", $_->[1], $_->[0]->attrib; } } sub print_stats { my $scan = shift; totals $scan; avgs $scan; top_posters $scan; top_xposts $scan; } ## main my $dump = shift || die "Usage: $0 \n"; my $scan; { my $VAR1; ## from the Data::Dumper output open DUMP, $dump or die "$0: failed open $dump: $!\n"; local $/; my $data = ; $scan = eval $data; die "$0: Error evaluating dumpfile: $@\n" if $@; close DUMP; } $Posters = $scan->posters; ## take out the old posters as we find them open OLD, "posters" or die "Failed open posters: $!\n"; while () { chomp; delete $Posters->{$_}; } ## grab some info for (values %$Posters) { $Articles += $_->articles; $Volume += $_->volume; $HVol += $_->header_volume; $HLns += $_->header_lines; $BVol += $_->body_volume; $BLns += $_->body_lines; $OVol += $_->orig_volume; $OLns += $_->orig_lines; $SVol += $_->sig_volume; $SLns += $_->sig_lines; } print_header $scan; print_stats $scan; News-Scan-0.53/eg/update-posters0100755006374700003100000000145207565154433016214 0ustar gbaconPremium#! /usr/bin/perl -w use strict; use News::Scan; my %Posters; $0 =~ s!^.*/!!; ## read in posters we know if (open OLD, "posters") { while () { chomp; $Posters{$_}++; } close OLD; } for (@ARGV) { unless (open SCAN, $_) { warn "$0: failed open $_: $!\n"; next; } local $/; my $data = ; my $VAR1; eval $data; if ($@) { warn "$0: while evaluating $_: $@\n"; close SCAN; next; } my $p = $VAR1->posters; unless (ref $p) { warn "$0: no posters in $_!\n"; close SCAN; next; } for (keys %$p) { $Posters{$_}++; } close SCAN; } open OUT, ">posters" or die "$0: failed open >posters: $!\n"; for (sort keys %Posters) { print OUT "$_\n"; } News-Scan-0.53/eg/news-stats0100755006374700003100000004135107621721653015344 0ustar gbaconPremium#! /usr/local/bin/perl -w use strict; use POSIX; use News::Scan; # e.g., $From = 'Stats Poster '; # this value will be used in print_header our $From; BEGIN { unless ($From) { die "\$From is empty"; } } ## subs sub in_kb { my $val = shift; sprintf "%.1f", ($val / 1024); } sub commify { local $_ = shift; 1 while s/^(-?\d+)(\d\d\d)/$1,$2/; $_; } sub places { my $acc = shift; my $val = shift; sprintf "%.${acc}f", $val } sub median { my @values = sort { $a <=> $b } @_; my $n = @values; if ($n % 2 == 1) { return $values[$n / 2]; } else { return places 1, ($values[$n / 2] + $values[$n/2 - 1]) / 2; } } sub mode { my %scores; local $_; for (@_) { $scores{$_}++; } my @scores = sort { $scores{$b} <=> $scores{$a} } keys %scores; my $high = $scores[0]; my $freq = $scores{$high}; my $i = 0; for (@scores) { if ($scores{$_} != $freq) { splice @scores, $i; last; } $i++; } if (@scores == 1) { return ($high, $freq); } elsif (@scores == 2) { return (join(" and ", @scores), $freq); } else { my $last = pop @scores; my $ret; $ret = join ", ", @scores; $ret .= ", and $last"; return ($ret, $freq); } } sub stdev { my @values = @_; my $avg = shift; my $n = @values; my $sum = 0; local $_; for (@values) { $sum += ($_ - $avg) ** 2; } places(1, sqrt($sum / $n)); } sub print_header { my $scan = shift; my $group = $scan->name; my $period = $scan->period; my $quote_re = $scan->quote_re; my $earliest = strftime "%d %b %Y %H:%M:%S", gmtime($scan->earliest); my $latest = strftime "%d %b %Y %H:%M:%S", gmtime($scan->latest); my $year = (localtime)[5]; $year += 1900; print < - Please send all comments to Greg Bacon . - Copyright (c) $year Greg Bacon. Verbatim copying and redistribution is permitted without royalty; alteration is not permitted. Redistribution and/or use for any commercial purpose is prohibited. EOF } sub excluded { my $scan = shift; my @excludes = @{ $scan->excludes }; if (@excludes) { print <posters; my $num_posters = scalar keys %$posters; my $num_sigs = $scan->signatures; my $num_articles = $scan->articles; my $threads = $scan->threads; my $num_threads = scalar keys %$threads; my $total_volume = in_kb $scan->volume; my $hdr_volume = in_kb $scan->header_volume; my $hdr_lines = commify $scan->header_lines; my $body_volume = in_kb $scan->body_volume; my $body_lines = commify $scan->body_lines; my $orig_volume = in_kb $scan->orig_volume; my $orig_lines = commify $scan->orig_lines; my $sig_volume = in_kb $scan->sig_volume; my $sig_lines = commify $scan->sig_lines; my $ocr = sprintf "%.3f", ($scan->orig_volume / $scan->body_volume); print <posters; my $num_posters = scalar keys %$posters; my $posts_avg = places 1, ($scan->articles / $num_posters); my @posts_by_poster = map { $_->articles } values %$posters; my $pmed = median @posts_by_poster; my($pmode, $pmode_score) = mode @posts_by_poster; my $psd = stdev @posts_by_poster, ($scan->articles / $num_posters); my $threads = $scan->threads; my $num_threads = scalar keys %$threads; my $thr_avg = places 1, ($scan->articles / $num_threads); my @posts_by_thread = map { $_->articles } values %$threads; my $tmed = median @posts_by_thread; my($tmode, $tmode_score) = mode @posts_by_thread; my $tsd = stdev @posts_by_thread, ($scan->articles / $num_threads); my $num_articles = $scan->articles; my $msg = places 1, ($scan->volume / $num_articles); my $hdr = places 1, ($scan->header_volume / $num_articles); my $hdr_lines = places 1, ($scan->header_lines / $num_articles); my $body = places 1, ($scan->body_volume / $num_articles); my $body_lines = places 1, ($scan->body_lines / $num_articles); my $orig = places 1, ($scan->orig_volume / $num_articles); my $orig_lines = places 1, ($scan->orig_lines / $num_articles); my $sig = places 1, ($scan->sig_volume / $num_articles); my $sig_lines = places 1, ($scan->sig_lines / $num_articles); print <posters; ## by posts @top = grep defined, ( map { $_->[0] } sort { $b->[1] <=> $a->[1] } map { [ $_, $_->articles ] } values %$posters )[0 .. 9]; $n = @top; $l = '=' x length $n; print <volume / 1024, $_->header_volume / 1024, $_->body_volume / 1024, $_->orig_volume / 1024; printf "%5d %26s %s\n", $_->articles, $vol, $_->attrib; $top_total += $_->articles; } printf "\nThese posters accounted for %.1f%% of all articles.\n", 100 * $top_total / $scan->articles; ## by volume @top = grep defined, ( map { $_->[0] } sort { $b->[1] <=> $a->[1] } map { [ $_, $_->volume ] } values %$posters )[0 .. 9]; $n = @top; $l = '=' x length $n; print <volume / 1024, $_->header_volume / 1024, $_->body_volume / 1024, $_->orig_volume / 1024; printf "%26s %5d %s\n", $vol, $_->articles, $_->attrib; $top_total += $_->volume; } printf "\nThese posters accounted for %.1f%% of the total volume.\n", 100 * $top_total / $scan->volume; ## top orig volume @top = grep defined, ( sort { $b->[1] <=> $a->[1] } map { [ $_, $_->orig_volume ] } grep { $_->articles >= 5 } values %$posters )[0 .. 9]; $n = @top; $l = '=' x length $n; print <[0]->articles, $_->[0]->orig_volume / 1024, $_->[0]->attrib; $top_total += $_->[0]->orig_volume; } printf "\nThese posters accounted for %.1f%% of the original volume.\n", 100 * $top_total / $scan->orig_volume; ## top OCR @top = grep defined, ( sort { $b->[1] <=> $a->[1] } map { [ $_, ($_->orig_volume / $_->body_volume) ] } grep { $_->articles >= 5 } values %$posters )[0 .. 9]; $n = @top; $l = '=' x length $n; print <[1], $_->[0]->orig_volume / 1024, $_->[0]->body_volume / 1024, $_->[0]->articles, $_->[0]->attrib; } ## bottom OCR @top = grep defined, ( sort { $a->[1] <=> $b->[1] } map { [ $_, ($_->orig_volume / $_->body_volume) ] } grep { $_->articles >= 5 } values %$posters )[0 .. 9]; $n = @top; $l = '=' x length $n; print <[1], $_->[0]->orig_volume / 1024, $_->[0]->body_volume / 1024, $_->[0]->articles, $_->[0]->attrib; } my $total = scalar keys %$posters; my $eligible = scalar grep { $_->articles >= 5 } values %$posters; my $pct = sprintf "%d", 100 * $eligible / $total; my $str = $eligible == 1 ? "One poster ($pct%)" : "$eligible posters ($pct%)"; print "\n$str had at least five posts.\n\n"; } sub top_threads { my $scan = shift; my @top; my $threads = $scan->threads; my $top_total; my $n; my $l; local $_; ## by posts @top = grep defined, ( map { $_->[0] } sort { $b->[1] <=> $a->[1] } map { [ $_, $_->articles ] } values %$threads )[0 .. 9]; $n = @top; $l = '=' x length $n; print <articles, $_->subject; $top_total += $_->articles; } printf "\nThese threads accounted for %.1f%% of all articles.\n", 100 * $top_total / $scan->articles; ## by volume @top = grep defined, ( map { $_->[0] } sort { $b->[1] <=> $a->[1] } map { [ $_, $_->volume ] } values %$threads )[0 .. 9]; $n = @top; $l = '=' x length $n; print <volume / 1024, $_->header_volume / 1024, $_->body_volume / 1024, $_->orig_volume / 1024; printf "%26s %5d %s\n", $vol, $_->articles, $_->subject; $top_total += $_->volume; } printf "\nThese threads accounted for %.1f%% of the total volume.\n", 100 * $top_total / $scan->volume; ## top OCR @top = grep defined, ( sort { $b->[1] <=> $a->[1] } map { [ $_, ($_->orig_volume / $_->body_volume) ] } grep { $_->articles >= 5 } values %$threads )[0 .. 9]; $n = @top; $l = '=' x length $n; print <[1], $_->[0]->orig_volume / 1024, $_->[0]->body_volume / 1024, $_->[0]->articles, $_->[0]->subject; } ## bottom OCR @top = grep defined, ( sort { $a->[1] <=> $b->[1] } map { [ $_, ($_->orig_volume / $_->body_volume) ] } grep { $_->articles >= 5 } values %$threads )[0 .. 9]; $n = @top; $l = '=' x length $n; print <[1], $_->[0]->orig_volume / 1024, $_->[0]->body_volume / 1024, $_->[0]->articles, $_->[0]->subject; } my $total = scalar keys %$threads; my $eligible = scalar grep { $_->articles >= 5 } values %$threads; my $pct = sprintf "%d", 100 * $eligible / $total; my $str = $eligible == 1 ? "One thread ($pct%)" : "$eligible threads ($pct%)"; print "\n$str had at least five posts.\n\n"; } sub top_xposts { my $scan = shift; my $xposts = $scan->crossposts || return; my $posters = $scan->posters; my @top; my $n; my $l; local $_; @top = grep defined, ( sort { $b->[1] <=> $a->[1] } map { [ $_, $xposts->{$_} ] } keys %$xposts )[0 .. 9]; $n = @top; $l = '=' x length $n; print <[1], $_->[0]; } @top = grep defined, ( sort { $b->[1] <=> $a->[1] } grep { $_->[1] > 0 } map { [ $_, $_->crossposts ] } values %$posters )[0 .. 9]; $n = @top; $l = '=' x length $n; print <[1], $_->[0]->attrib; } } sub print_stats { my $scan = shift; excluded $scan; totals $scan; avgs $scan; top_posters $scan; top_threads $scan; top_xposts $scan; } ## main my $dump = shift || die "Usage: $0 \n"; my $scan; { my $VAR1; ## from the Data::Dumper output open DUMP, $dump or die "$0: failed open $dump: $!\n"; local $/ = undef; my $data = ; $scan = eval $data; die "$0: Error evaluating dumpfile: $@\n" if $@; close DUMP; } print_header $scan; print_stats $scan; News-Scan-0.53/eg/nntpget0100755006374700003100000000104207075371733014707 0ustar gbaconPremium#! /usr/local/bin/perl -w use News::Scan; my $HOME = $ENV{HOME} || '/'; my $scan = new News::Scan Group => 'comp.lang.perl.misc', Spool => "$HOME/spool/comp.lang.perl.misc", From => 'nntp', NNTPServer => 'news.uah.edu'; if (not defined $scan) { die "Failed to create News::Group object\n"; } elsif ($scan->error) { die "Error: " . $scan->error . "\n"; } $scan->collect; if ($scan->error) { die "Error: " . $scan->error . "\n"; } exit; News-Scan-0.53/eg/news-stats.deutsch0100755006374700003100000003301507565154023016776 0ustar gbaconPremium#!/usr/bin/perl -w # Thank you to Gerrit P. Haase for translating this program's # output to German use strict; use POSIX; use News::Scan; ## subs sub in_kb { my $val = shift; sprintf "%.1f", ($val / 1024); } sub commify { local $_ = shift; 1 while s/^(-?\d+)(\d\d\d)/$1,$2/; $_; } sub places { my $acc = shift; my $val = shift; sprintf "%.${acc}f", $val } sub median { my @values = sort { $a <=> $b } @_; my $n = @values; if ($n % 2 == 1) { return $values[$n / 2]; } else { return places 1, ($values[$n / 2] + $values[$n/2 - 1]) / 2; } } sub mode { my %scores; local $_; for (@_) { $scores{$_}++; } my @scores = sort { $scores{$b} <=> $scores{$a} } keys %scores; my $high = $scores[0]; my $freq = $scores{$high}; my $i = 0; for (@scores) { if ($scores{$_} != $freq) { splice @scores, $i; last; } $i++; } if (@scores == 1) { return ($high, $freq); } elsif (@scores == 2) { return (join(" and ", @scores), $freq); } else { my $last = pop @scores; my $ret; $ret = join ", ", @scores; $ret .= ", and $last"; return ($ret, $freq); } } sub stdev { my @values = @_; my $avg = shift; my $n = @values; my $sum = 0; local $_; for (@values) { $sum += ($_ - $avg) ** 2; } places(1, sqrt($sum / $n)); } sub print_header { my $scan = shift; my $group = $scan->name; my $period = $scan->period; my $quote_re = $scan->quote_re; my $earliest = strftime "%d %b %Y %H:%M:%S", gmtime($scan->earliest); my $latest = strftime "%d %b %Y %H:%M:%S", gmtime($scan->latest); print < Newsgroup: $group Subject: Statistik fuer $group Zusammenfassung von Artikeln im Zeitraum von $period Tagen, erster Artikel in dem Zeitraum $earliest GMT und letzter Artikel $latest GMT. Anmerkungen =========== - Eine Zeile im Body wird als Original betrachtet wenn sie *nicht* den regulaeren Ausdruck /$quote_re/ trifft. - Jeder Text nach dem Sigtrenner (/^-- \$/) im Body wird als Signatur des Authors betrachtet. - Der Scanner bevorzugt den Reply-To: Header dem From: Header um die 'echte' Emailadresse herauszufinden. - Original Content Rating ist das Verhältnis des eigenen Nachrichtentexts zum gesamten Nachrichtentext. - Kommentare an Gerrit P. Haase . - Die News-Scan Distribution befindet sich auf CPAN! - Copyright (c) 2002 Greg Bacon. Kopieren und Weiterverteilung ist gestattet nur ohne Gebuehr; Aenderungen sind nicht erlaubt. Weitervertrieb und/oder Nutzung fuer irgend einen kommerziellen Zweck ist verboten. EOF } sub excluded { my $scan = shift; my @excludes = @{ $scan->excludes }; if (@excludes) { print <posters; my $num_posters = scalar keys %$posters; my $num_sigs = $scan->signatures; my $num_articles = $scan->articles; my $threads = $scan->threads; my $num_threads = scalar keys %$threads; my $total_volume = in_kb $scan->volume; my $hdr_volume = in_kb $scan->header_volume; my $hdr_lines = commify $scan->header_lines; my $body_volume = in_kb $scan->body_volume; my $body_lines = commify $scan->body_lines; my $orig_volume = in_kb $scan->orig_volume; my $orig_lines = commify $scan->orig_lines; my $sig_volume = in_kb $scan->sig_volume; my $sig_lines = commify $scan->sig_lines; my $ocr = sprintf "%.3f", ($scan->orig_volume / $scan->body_volume); print <posters; my $num_posters = scalar keys %$posters; my $posts_avg = places 1, ($scan->articles / $num_posters); my @posts_by_poster = map { $_->articles } values %$posters; my $pmed = median @posts_by_poster; my($pmode, $pmode_score) = mode @posts_by_poster; my $psd = stdev @posts_by_poster, ($scan->articles / $num_posters); my $threads = $scan->threads; my $num_threads = scalar keys %$threads; my $thr_avg = places 1, ($scan->articles / $num_threads); my @posts_by_thread = map { $_->articles } values %$threads; my $tmed = median @posts_by_thread; my($tmode, $tmode_score) = mode @posts_by_thread; my $tsd = stdev @posts_by_thread, ($scan->articles / $num_threads); my $num_articles = $scan->articles; my $msg = places 1, ($scan->volume / $num_articles); my $hdr = places 1, ($scan->header_volume / $num_articles); my $hdr_lines = places 1, ($scan->header_lines / $num_articles); my $body = places 1, ($scan->body_volume / $num_articles); my $body_lines = places 1, ($scan->body_lines / $num_articles); my $orig = places 1, ($scan->orig_volume / $num_articles); my $orig_lines = places 1, ($scan->orig_lines / $num_articles); my $sig = places 1, ($scan->sig_volume / $num_articles); my $sig_lines = places 1, ($scan->sig_lines / $num_articles); print <posters; ## by posts print <[0] } sort { $b->[1] <=> $a->[1] } map { [ $_, $_->articles ] } values %$posters )[0 .. 9]; $top_total = 0; for (@top) { my $vol = sprintf "%5.1f (%5.1f/%5.1f/%5.1f)", $_->volume / 1024, $_->header_volume / 1024, $_->body_volume / 1024, $_->orig_volume / 1024; printf "%5d %26s %s\n", $_->articles, $vol, $_->attrib; $top_total += $_->articles; } printf "\nDiese Poster zeichnen verantwortlich fuer %.1f%% aller Artikel.\n", 100 * $top_total / $scan->articles; ## by volume print <[0] } sort { $b->[1] <=> $a->[1] } map { [ $_, $_->volume ] } values %$posters )[0 .. 9]; $top_total = 0; for (@top) { my $vol = sprintf "%5.1f (%5.1f/%5.1f/%5.1f)", $_->volume / 1024, $_->header_volume / 1024, $_->body_volume / 1024, $_->orig_volume / 1024; printf "%26s %5d %s\n", $vol, $_->articles, $_->attrib; $top_total += $_->volume; } printf "\nDiese Poster zeichnen verantwortlich fuer %.1f%% des gesamten Volumens.\n", 100 * $top_total / $scan->volume; ## top OCR print <[1] <=> $a->[1] } map { [ $_, ($_->orig_volume / $_->body_volume) ] } grep { $_->articles >= 5 } values %$posters )[0 .. 9]; for (@top) { printf "%.3f (%5.1f /%5.1f) %5d %s\n", $_->[1], $_->[0]->orig_volume / 1024, $_->[0]->body_volume / 1024, $_->[0]->articles, $_->[0]->attrib; } ## bottom OCR print <[1] <=> $b->[1] } map { [ $_, ($_->orig_volume / $_->body_volume) ] } grep { $_->articles >= 5 } values %$posters )[0 .. 9]; for (reverse @top) { printf "%.3f (%5.1f /%5.1f) %5d %s\n", $_->[1], $_->[0]->orig_volume / 1024, $_->[0]->body_volume / 1024, $_->[0]->articles, $_->[0]->attrib; } print "\n"; } sub top_threads { my $scan = shift; my @top; my $threads = $scan->threads; local $_; ## by posts print <[0] } sort { $b->[1] <=> $a->[1] } map { [ $_, $_->articles ] } values %$threads )[0 .. 9]; for (@top) { printf "%5d %s\n", $_->articles, $_->subject; } ## by volume print <[0] } sort { $b->[1] <=> $a->[1] } map { [ $_, $_->volume ] } values %$threads )[0 .. 9]; for (@top) { my $vol = sprintf "%5.1f (%5.1f/%5.1f/%5.1f)", $_->volume / 1024, $_->header_volume / 1024, $_->body_volume / 1024, $_->orig_volume / 1024; printf "%26s %5d %s\n", $vol, $_->articles, $_->subject; } ## top OCR print <[1] <=> $a->[1] } map { [ $_, ($_->orig_volume / $_->body_volume) ] } grep { $_->articles >= 3 } values %$threads )[0 .. 9]; for (@top) { printf "%.3f (%5.1f/ %5.1f) %5d %s\n", $_->[1], $_->[0]->orig_volume / 1024, $_->[0]->body_volume / 1024, $_->[0]->articles, $_->[0]->subject; } ## bottom OCR print <[1] <=> $b->[1] } map { [ $_, ($_->orig_volume / $_->body_volume) ] } grep { $_->articles >= 3 } values %$threads )[0 .. 9]; for (reverse @top) { printf "%.3f (%5.1f /%5.1f) %5d %s\n", $_->[1], $_->[0]->orig_volume / 1024, $_->[0]->body_volume / 1024, $_->[0]->articles, $_->[0]->subject; } print "\n"; } sub top_xposts { my $scan = shift; my @top; my $xposts = $scan->crossposts || return; my $posters = $scan->posters; local $_; print <[1] <=> $a->[1] } map { [ $_, $xposts->{$_} ] } keys %$xposts )[0 .. 9]; for (@top) { next unless defined $_; printf "%8d %s\n", $_->[1], $_->[0]; } print <[1] <=> $a->[1] } map { [ $_, $_->crossposts ] } values %$posters )[0 .. 9]; for (@top) { next unless defined $_; printf "%8d %s\n", $_->[1], $_->[0]->attrib; } } sub print_stats { my $scan = shift; excluded $scan; totals $scan; avgs $scan; top_posters $scan; top_threads $scan; top_xposts $scan; } ## main my $dump = shift || die "Usage: $0 \n"; my $scan; { my $VAR1; ## from the Data::Dumper output open DUMP, $dump or die "$0: failed open $dump: $!\n"; local $/ = undef; my $data = ; $scan = eval $data; die "$0: Error evaluating dumpfile: $@\n" if $@; close DUMP; } print_header $scan; print_stats $scan; News-Scan-0.53/eg/README0100644006374700003100000000032007621717454014161 0ustar gbaconPremiumHere's my usual sequence of commands for clpmisc: % clpm-get % ./news-scan >2003-02-10.scan % news-stats 2003-02-10.scan >2003-02-10.scan % cd ~/spool % ./empty-old 2 comp.lang.perl.misc News-Scan-0.53/eg/empty-old0100755006374700003100000000241407621442037015137 0ustar gbaconPremium#! /usr/local/bin/perl -w use strict; use Date::Parse; sub usage { "Usage: $0 age [dir]\n" . "where:\n" . " - age is the age in days\n" . " - dir is the directory to scan (. by default)\n" } die usage unless @ARGV == 2 || @ARGV == 3; my $age = shift; my $dir = shift; unless ($age =~ /^\d+$/) { die "$0: age must be an integer\n"; } $dir = '.' unless defined $dir; opendir DIR, $dir or die "$0: opendir $dir: $!\n"; open SEEN, ">>$dir/.seen" or die "$0: open >>$dir/.seen: $!\n"; $/ = ""; my $file; my $hdr; my $path; while ($file = readdir DIR) { next if $file eq ".seen"; $path = "$dir/$file"; next unless -f $path; if (-s _ == 0) { print SEEN "$file\n" or warn "$0: print SEEN $file: $!\n"; unlink $path or warn "$0: unlink $path: $!\n"; next; } unless (open ART, $path) { warn "$0: open $path: $!\n"; next; } $hdr = ; unless ($hdr =~ /^Date:\s+(.+)$/mi) { warn "$0: $path: weird header: `$hdr'\n"; next; } close ART; my $time = str2time($1) || 0; if ($^T - $time >= $age * 86400) { print SEEN "$file\n" or warn "$0: print SEEN $file: $!\n"; unlink $path or warn "$0: unlink $path: $!\n"; } } News-Scan-0.53/MANIFEST0100644006374700003100000000037307621720144014035 0ustar gbaconPremiumChanges MANIFEST Makefile.PL News/Scan.pm News/Scan/Article.pm News/Scan/Poster.pm News/Scan/Thread.pm README TODO eg/README eg/avg-addy eg/empty-old eg/new-posters eg/news-scan eg/news-stats eg/news-stats.deutsch eg/nntpget eg/update-posters t/use.t News-Scan-0.53/News/0040755006374700003100000000000007621721716013626 5ustar gbaconPremiumNews-Scan-0.53/News/Scan/0040755006374700003100000000000007621721716014512 5ustar gbaconPremiumNews-Scan-0.53/News/Scan/Article.pm0100644006374700003100000001532007144353366016433 0ustar gbaconPremiumpackage News::Scan::Article; use strict; use vars qw( $VERSION @ISA ); use Mail::Internet; use Mail::Address; use Date::Parse; $VERSION = '0.51'; @ISA = qw( Mail::Internet ); sub new { my $class = shift; my $group = pop; my $self = $class->SUPER::new(@_); bless $self, $class; $self->group($group); $self->calculate_sizes; if ($self->in_period($group->period)) { return $self; } else { return undef; } } sub in_period { my $self = shift; my $period = shift(@_) * 60 * 60 * 24; my $date = $self->head->get('Date'); return 0 unless (defined $date and $date); chomp $date; my $time = str2time $date; if ($time < ($^T - $period)) { return 0; } $self->group->earliest($time); $self->group->latest($time); 1; } sub group { my $self = shift; if (@_) { my $old = $self->{'news_scan_article_group'}; $self->{'news_scan_article_group'} = shift; return $old; } else { return $self->{'news_scan_article_group'}; } } sub calculate_sizes { my $self = shift; my $total = 0; my $line; ## header my $header_size = 0; foreach $line (@{ $self->head->header }) { $header_size += length $line; $self->{'news_scan_article_header_lines'}++; } $total += $header_size; $self->{'news_scan_article_header_size'} = $header_size; ## add a byte for the separator $total++; ## signature (if present) my @body = @{ $self->body }; my $sig_start = 0; my $found_sig = 0; foreach $line (reverse @body) { $sig_start--; if ($line =~ /^-- $/) { $found_sig++; last; } } if ($found_sig) { my @signature = splice @body, $sig_start; shift @signature; ## toss cutline $self->{'news_scan_article_sig_lines'} = @signature; my $sig_size = 0; foreach $line (@signature) { $sig_size += length $line; } $self->{'news_scan_article_sig_size'} = $sig_size; $total += $sig_size; } else { $self->{'news_scan_article_sig_lines'} = 0; $self->{'news_scan_article_sig_size'} = 0; } ## body my $body_size = 0; foreach $line (@body) { $body_size += length $line; } $self->{'news_scan_article_body_size'} = $body_size; $self->{'news_scan_article_body_lines'} = @body; $total += $body_size; $self->{'news_scan_article_size'} = $total; ## original if (my $group = $self->group || 0) { my $quote_re = $group->quote_re; if ($quote_re) { my @orig = grep { ! /$quote_re/o } @body; my $orig_size = 0; foreach $line (@orig) { $orig_size += length $line; } $self->{'news_scan_article_orig_size'} = $orig_size; $self->{'news_scan_article_orig_lines'} = @orig; } } else { $self->{'news_scan_article_orig_size'} = 0; $self->{'news_scan_article_orig_lines'} = 0; } } sub author { my $self = shift; my $hd = $self->head || return; my $from = $hd->get('Reply-To') || $hd->get('From') || $hd->get('Sender') || ""; chomp $from; my $addr = ( Mail::Address->parse($from) )[0]; if (exists $self->group->aliases->{lc $addr->address}) { ## XXX: Danger, Will Robinson! Broken Encapsulation Alert!!! $addr->[1] = $self->group->aliases->{lc $addr->address}; } unless (defined $addr and ref $addr) { return; } else { return $addr; } } sub message_id { my $self = shift; my $hdr = $self->head->get('Message-ID'); chomp $hdr; $hdr; } sub subject { my $self = shift; my $hdr = $self->head->get('Subject'); chomp $hdr; $hdr; } sub newsgroups { my $self = shift; my $hdr = $self->head->get('Newsgroups') || ''; $hdr =~ s/^\s+//; $hdr =~ s/\s+$//; split /\s*,+\s*/, $hdr; } sub size { $_[0]->{'news_scan_article_size'} } sub header_size { $_[0]->{'news_scan_article_header_size'} } sub body_size { $_[0]->{'news_scan_article_body_size'} } sub orig_size { $_[0]->{'news_scan_article_orig_size'} } sub sig_size { $_[0]->{'news_scan_article_sig_size'} } sub header_lines { $_[0]->{'news_scan_article_header_lines'} } sub body_lines { $_[0]->{'news_scan_article_body_lines'} } sub orig_lines { $_[0]->{'news_scan_article_orig_lines'} } sub sig_lines { $_[0]->{'news_scan_article_sig_lines'} } 1; __END__ =head1 NAME News::Scan::Article - collect information about news articles =head1 SYNOPSIS use News::Scan::Article; my $art = News::Scan::Article->new( ARG, [ OPTIONS, ] SCAN ); =head1 DESCRIPTION This module provides a derived class of C whose objects are suitable for digesting Usenet news articles. =head1 CONSTRUCTOR =over 4 =item new ( ARG, [ OPTIONS, ] SCAN-OBJ ) The C and C parameters are identical to those required by C, except C is required. See L. The C parameter should be a C object. See L. If the article falls into the period of interest for C, the object is returned, else C. =back =head1 METHODS =over 4 =item group ( [ SCAN-OBJ ] ) Sets or returns an object's group depending on whether C is present. =item author Returns the article's author represented as a C object. =item message_id Returns the article's Message-ID. =item subject Returns the article's subject. =item newsgroups Returns the list of newsgroups this article was posted to. =item size Returns the size of this article in bytes. =item header_size Returns the size of this article's header in bytes. =item header_lines Returns the number of lines consumed in this article by headers. =item body_size Returns the size of this article's body in bytes. =item body_lines Returns the number of lines consumed in this article by the body. =item orig_size Returns the size of this article's original content in bytes. See L. =item orig_lines Returns the number of lines consumed in this article by original content. Keep in mind that original content is a subset of the body. =item sig_size Returns the size of this article'ss signature in bytes. =item sig_lines Returns the number of lines consumed in this article by the signature. =back =head1 SEE ALSO L, L, L =head1 AUTHOR Greg Bacon =head1 COPYRIGHT Copyright (c) 1997 Greg Bacon. All Rights Reserved. This library is free software. You may distribute and/or modify it under the same terms as Perl itself. =cut News-Scan-0.53/News/Scan/Thread.pm0100644006374700003100000001075707075371704016267 0ustar gbaconPremium# News::Scan::Thread package News::Scan::Thread; use strict; use vars '$VERSION'; use Carp; use News::Scan::Article; $VERSION = '0.51'; sub new { my $class = shift; my $self = {}; my $art = shift; my $subj = shift; bless $self, $class; $self->subject($subj); $self->volume($art->size); $self->articles(1); $self->header_volume($art->header_size); $self->header_lines($art->header_lines); $self->body_volume($art->body_size); $self->body_lines($art->body_lines); $self->orig_volume($art->orig_size); $self->orig_lines($art->orig_lines); $self->sig_volume($art->sig_size); $self->sig_lines($art->sig_lines); $self; } sub subject { my $self = shift; if (@_) { $self->{'news_scan_thread_subject'} = shift; } else { return $self->{'news_scan_thread_subject'}; } } sub volume { my $self = shift; if (@_) { $self->{'news_scan_thread_volume'} = shift; } else { return $self->{'news_scan_thread_volume'}; } } sub articles { my $self = shift; if (@_) { $self->{'news_scan_thread_articles'} = shift; } else { return $self->{'news_scan_thread_articles'}; } } sub header_volume { my $self = shift; if (@_) { $self->{'news_scan_thread_header_volume'} = shift; } else { return $self->{'news_scan_thread_header_volume'}; } } sub header_lines { my $self = shift; if (@_) { $self->{'news_scan_thread_header_lines'} = shift; } else { return $self->{'news_scan_thread_header_lines'}; } } sub body_volume { my $self = shift; if (@_) { $self->{'news_scan_thread_body_volume'} = shift; } else { return $self->{'news_scan_thread_body_volume'}; } } sub body_lines { my $self = shift; if (@_) { $self->{'news_scan_thread_body_lines'} = shift; } else { return $self->{'news_scan_thread_body_lines'}; } } sub orig_volume { my $self = shift; if (@_) { $self->{'news_scan_thread_orig_volume'} = shift; } else { return $self->{'news_scan_thread_orig_volume'}; } } sub orig_lines { my $self = shift; if (@_) { $self->{'news_scan_thread_orig_lines'} = shift; } else { return $self->{'news_scan_thread_orig_lines'}; } } sub sig_volume { my $self = shift; if (@_) { $self->{'news_scan_thread_sig_volume'} = shift; } else { return $self->{'news_scan_thread_sig_volume'}; } } sub sig_lines { my $self = shift; if (@_) { $self->{'news_scan_thread_sig_lines'} = shift; } else { return $self->{'news_scan_thread_sig_lines'}; } } 1; __END__ =head1 NAME News::Scan::Thread - keep track of threads in a Usenet newsgroup =head1 SYNOPSIS use News::Scan::Thread; my $thr = News::Scan::Thread->new($news_scan_article_obj); =head1 DESCRIPTION This module provides a class whose objects can be used to keep track of threads of discussion in a Usenet newsgroup. =head1 CONSTRUCTOR =over 4 =item new ( ARTICLE ) C
should be a C object or an object of some class derived from C. C performs some initialization and returns a C. =back =head1 METHODS =over 4 =item subject Returns this thread's subject. =item volume Returns the volume in bytes generated in this thread. =item articles Returns the number of posts to this thread. =item header_volume Returns the volume in bytes of the headers in this thread's articles. =item header_lines Returns the number of header lines in this thread's articles. =item body_volume Returns the volume in bytes of the message bodies of this thread's articles. =item body_lines Returns the number of lines in this thread's message bodies. =item orig_volume Returns the volume in bytes of the original content of this thread's articles. =item orig_lines Returns the number of original lines in this thread's articles. =item sig_volume Returns the volume in bytes of the signatures of this thread's articles. =item sig_lines Returns the number of signature lines in this thread's articles. =back =head1 SEE ALSO L, L =head1 AUTHOR Greg Bacon =head1 COPYRIGHT Copyright (c) 1997 Greg Bacon. All Rights Reserved. This library is free software. You may distribute and/or modify it under the same terms as Perl itself. =cut News-Scan-0.53/News/Scan/Poster.pm0100644006374700003100000001633607075371703016332 0ustar gbaconPremium# News::Scan::Poster package News::Scan::Poster; use strict; use vars '$VERSION'; use Carp; $VERSION = '0.51'; sub new { my $class = shift; my $self = {}; my $art; croak "usage: ${class}->new(ARTICLE-OBJ)" unless @_ == 1; $art = shift; $self->{'news_scan_poster_posted_to'} = {}; $self->{'news_scan_poster_message_ids'} = []; bless $self, $class; $self->address($art->author); $self->attrib($art->author->format); $self->volume($art->size); $self->articles(1); $self->message_ids($art->message_id); $self->posted_to($art); $self->header_volume($art->header_size); $self->header_lines($art->header_lines); $self->body_volume($art->body_size); $self->body_lines($art->body_lines); $self->orig_volume($art->orig_size); $self->orig_lines($art->orig_lines); $self->sig_volume($art->sig_size); $self->sig_lines($art->sig_lines); $self; } sub address { my $self = shift; if (@_) { $self->{'news_scan_poster_address'} = shift; } else { return $self->{'news_scan_poster_address'}; } } sub attrib { my $self = shift; return $self->{'news_scan_poster_attrib'} if $self->{'news_scan_poster_attrib'}; my $addr = $self->{'news_scan_poster_address'}; return unless $addr; my $phrase = $addr->phrase || ''; my $address = $addr->address || ''; my $comment = $addr->comment || ''; my $attrib = ''; for ($phrase, $address, $comment) { s/^\s+//; s/\s+$//; } if ($phrase) { if ($comment) { # expect $comment surrounded by () $attrib = "$phrase $comment"; } else { $attrib = $phrase; } } else { $attrib = $comment; $attrib =~ s/^\(//; $attrib =~ s/\)$//; } if ($attrib) { $attrib .= " <$address>"; } else { $attrib = $address; } $self->{'news_scan_poster_attrib'} = $attrib; } sub message_ids { my $self = shift; if (@_) { push @{$self->{'news_scan_poster_message_ids'}}, shift; } else { return @{$self->{'news_scan_poster_message_ids'}}; } } sub volume { my $self = shift; if (@_) { $self->{'news_scan_poster_volume'} = shift; } else { return $self->{'news_scan_poster_volume'}; } } sub articles { my $self = shift; if (@_) { $self->{'news_scan_poster_articles'} = shift; } else { return $self->{'news_scan_poster_articles'}; } } sub posted_to { my $self = shift; if (@_) { my $art = shift; my %uniq; for ($art->newsgroups) { $uniq{$_}++; } delete $uniq{$art->group->name}; for (keys %uniq) { $self->{'news_scan_poster_posted_to'}{$_}++; } } else { return %{$self->{'news_scan_poster_posted_to'}}; } } sub crossposts { my $self = shift; my $total = 0; for (keys %{$self->{'news_scan_poster_posted_to'}}) { $total += $self->{'news_scan_poster_posted_to'}{$_}; } $total; } sub header_volume { my $self = shift; if (@_) { $self->{'news_scan_poster_header_volume'} = shift; } else { return $self->{'news_scan_poster_header_volume'}; } } sub header_lines { my $self = shift; if (@_) { $self->{'news_scan_poster_header_lines'} = shift; } else { return $self->{'news_scan_poster_header_lines'}; } } sub body_volume { my $self = shift; if (@_) { $self->{'news_scan_poster_body_volume'} = shift; } else { return $self->{'news_scan_poster_body_volume'}; } } sub body_lines { my $self = shift; if (@_) { $self->{'news_scan_poster_body_lines'} = shift; } else { return $self->{'news_scan_poster_body_lines'}; } } sub orig_volume { my $self = shift; if (@_) { $self->{'news_scan_poster_orig_volume'} = shift; } else { return $self->{'news_scan_poster_orig_volume'}; } } sub orig_lines { my $self = shift; if (@_) { $self->{'news_scan_poster_orig_lines'} = shift; } else { return $self->{'news_scan_poster_orig_lines'}; } } sub sig_volume { my $self = shift; if (@_) { $self->{'news_scan_poster_sig_volume'} = shift; } else { return $self->{'news_scan_poster_sig_volume'}; } } sub sig_lines { my $self = shift; if (@_) { $self->{'news_scan_poster_sig_lines'} = shift; } else { return $self->{'news_scan_poster_sig_lines'}; } } 1; __END__ =head1 NAME News::Scan::Poster - keep track of posters to a newsgroup =head1 SYNOPSIS use News::Scan::Poster; my $poster = News::Scan::Poster->new($news_scan_article_obj); =head1 DESCRIPTION This module provides a class whose objects can be used to keep track of cumulative statistics for posters to a Usenet newsgroup such as header volume or signature lines. =head1 CONSTRUCTOR =over 4 =item new ( ARTICLE ) C
should be a C object or inherit from the C class. C performs some initialization and returns a C object. =back =head1 METHODS =over 4 =item address ( [ ADDRESS ] ) Returns the address of this poster represented as a C object. If present, C
tells the object that the C object in C
is its address. idea. =item attrib ( [ ATTRIBUTION ] ) Returns some nice attribution for this poster. If present, C tells the object how it shall identify itself when asked. =item message_ids ( [ MESSAGE-ID ] ) Returns a list of Message-IDs attributed to this poster. If present, C is added to this list of this poster's articles. =item volume Returns the volume in bytes of the traffic generated by this poster. =item articles Returns the number of articles attributed to this poster. =item posted_to Returns a hash whose keys are newsgroup names and whose values are the number of times this poster has crossposted to the group of interest and the corresponding newsgroup. =item crossposts Returns the total number of crossposts this poster has sent through the group of interest. =item header_volume Returns the volume in bytes generated by this poster's headers. =item header_lines Returns the number of header lines generated by this poster. =item body_volume Returns the volume in bytes generated by this poster's message bodies. =item body_lines Returns the number of body lines generated by this poster. =item orig_volume Returns the volume in bytes of original content generated by this poster. =item orig_lines Returns the number of original lines generated by this poster. =item sig_volume Returns the volume in bytes generated by this poster's signatures. =item sig_lines Returns the number of signature lines generated by this poster. =back =head1 SEE ALSO L, L, L =head1 AUTHOR Greg Bacon =head1 COPYRIGHT Copyright (c) 1997 Greg Bacon. All Rights Reserved. This library is free software. You may distribute and/or modify it under the same terms as Perl itself. =cut News-Scan-0.53/News/Scan.pm0100644006374700003100000005503707621720374015056 0ustar gbaconPremiumpackage News::Scan; use 5.004; use strict; use vars qw( $VERSION $AUTOLOAD ); use Carp; use IO::File; use IO::Seekable; ## get the seek constants use Mail::Address; use News::Scan::Article; use News::Scan::Poster; use News::Scan::Thread; $VERSION = '0.53'; ## play a fun little game here my $Have_Net_NNTP = 0; if (eval { require Net::NNTP }) { Net::NNTP->import; $Have_Net_NNTP++; } ## methods defined for our instances my %Permitted = ( name => undef, spool => undef, period => undef, aliases => undef, nntp_server => undef, nntp_auth_login => undef, nntp_auth_passwd => undef, nntp_client => undef, articles => undef, volume => undef, header_volume => undef, header_lines => undef, body_volume => undef, body_lines => undef, orig_volume => undef, orig_lines => undef, sig_volume => undef, sig_lines => undef, signatures => undef, ); sub new { my $class = shift; my $self = { news_scan_posters => {}, news_scan_xposts => {}, news_scan_threads => {}, news_scan_earliest => $^T, news_scan_latest => 0, news_scan_excludes => [], news_scan_aliases => {}, }; bless $self, $class; if (@_) { $self->configure(@_); if ($self->error) { return $self; } } $self->init; $self; } sub AUTOLOAD { my $self = $_[0]; my $class = ref $self || croak "`$self' is not an object"; my $name = $AUTOLOAD; $name =~ s/^.*:://; unless (exists $Permitted{$name}) { croak "Can't access `$name' field in class `$class'"; } eval <error(0); if (\@_) { my \$old = \$self->{'news_scan_$name'}; \$self->{'news_scan_$name'} = shift; return \$old; } else { return \$self->{'news_scan_$name'}; } } EOSub goto &$name; } sub configure { my $self = shift; my %arg = @_; $self->error(0); if (exists $arg{From}) { $self->from(delete $arg{From}); return undef if $self->error; } if (exists $arg{Group}) { $self->name(delete $arg{Group}); } if (exists $arg{Spool}) { $self->spool(delete $arg{Spool}); } if (exists $arg{NNTPServer}) { $self->nntp_server(delete $arg{NNTPServer}); } if (exists $arg{NNTPAuthLogin}) { $self->nntp_auth_login(delete $arg{NNTPAuthLogin}); } if (exists $arg{NNTPAuthPasswd}) { $self->nntp_auth_passwd(delete $arg{NNTPAuthPasswd}); } if (exists $arg{Period}) { $self->period(delete $arg{Period}); } else { $self->period(7); } if (exists $arg{QuoteRE}) { $self->quote_re(delete $arg{QuoteRE}); return undef if $self->error; } else { $self->quote_re('^\s{0,3}(?:>|:|\S+>|\+\+)'); } if (exists $arg{Exclude}) { $self->exclude(delete $arg{Exclude}); return undef if $self->error; } if (exists $arg{Aliases}) { $self->aliases(delete $arg{Aliases}); } 1; } sub init { my $self = shift; $self->error(0); $self->articles(0); $self->volume(0); $self->header_volume(0); $self->header_lines(0); $self->body_volume(0); $self->body_lines(0); $self->orig_volume(0); $self->orig_lines(0); $self->sig_volume(0); $self->sig_lines(0); $self->signatures(0); } sub earliest { my $self = shift; if (@_) { my $try = shift; if ($try < $self->{'news_scan_earliest'}) { $self->{'news_scan_earliest'} = $try; return 1; ## indicate success } else { return 0; } } else { return $self->{'news_scan_earliest'}; } } sub latest { my $self = shift; if (@_) { my $try = shift; if ($try > $self->{'news_scan_latest'}) { $self->{'news_scan_latest'} = $try; return 1; ## indicate success } else { return 0; } } else { return $self->{'news_scan_latest'}; } } sub from { my $self = shift; $self->error(0); if (@_) { my $old = $self->{'news_scan_from'}; my $from = shift; if (lc($from) eq 'spool') { $self->{'news_scan_from'} = 'spool'; } elsif (lc($from) eq 'nntp') { unless ($Have_Net_NNTP) { croak <{'news_scan_from'} = 'nntp'; } else { return $self->error("Invalid news source: `$from'"); } return $old; } else { return $self->{'news_scan_from'}; } } sub quote_re { my $self = shift; if (@_) { my $old = $self->{'news_scan_quote_re'}; my $new = shift; unless (eval { local $SIG{'__DIE__'}; local $_ = ''; /$new/, 1 }) { $@ =~ s/^(.*) at.*$/$1/s; return $self->error($@); } $self->error(0); $self->{'news_scan_quote_re'} = $new; return $old; } else { return $self->{'news_scan_quote_re'}; } } sub exclude { my $self = shift; my $pariahs = shift; unless (defined $pariahs and ref $pariahs) { return $self->error("exclude takes a reference to an array"); } $self->{'news_scan_excludes'} = $pariahs; my $matcher = 'local $_ = shift;'; $matcher .= 'study;' if @$pariahs >= 5; local $_; for (@$pariahs) { unless (eval { local $SIG{'__DIE__'}; /$_/i, 1 }) { $@ =~ s/^(.*) at.*$/$1/s; return $self->error("Bad pattern: $@\n"); } $matcher .= "return 1 if /$_/i;"; } $matcher .= 'return 0;'; $self->{'news_scan_excluded_sub'} = eval "sub { $matcher }"; return $self->error("Failed to generate exclusion: $@") if $@; $self->error(0); } sub excludes { \@{ $_[0]->{'news_scan_excludes'} } } sub excluded { my $self = shift; my $addr = shift; ## Mail::Address (or descendant) object $self->error(0); ## exclude empty addresses return 1 unless (defined $addr and ref $addr); my $decision = $self->{'news_scan_excluded_sub'}; unless (defined $decision and ref $decision) { return 0; } $decision->($addr->address); } sub nntp_connect { my $self = shift; $self->error(0); return if defined $self->nntp_client; my $client; my $nntp_host = ''; my $nntp_port = ''; my $server = $self->nntp_server || ''; if ($server) { ($nntp_host, $nntp_port) = split /:/, $server; } my @args = (); push @args, $nntp_host if $nntp_host; push @args, (Port => $nntp_port) if $nntp_port; $client = Net::NNTP->new(@args); unless (defined $client) { return $self->error("Failed to create Net::NNTP object"); } my $login = $self->nntp_auth_login || ''; my $passwd = $self->nntp_auth_passwd || ''; if ($login and $passwd) { unless ($client->authinfo($login, $passwd)) { return $self->error("Authinfo failed"); } } $self->nntp_client($client); 1; } sub _next_nntp_article { my $self = shift; my $client; $client = $self->nntp_client; unless (defined $client) { unless ($self->nntp_connect) { return $self->error("Failed to establish NNTP connection: " . $self->error); } $client = $self->nntp_client; unless ($client->group($self->name)) { return $self->error("Invalid group name: `" . $self->name . "'"); } $self->{'news_scan_article_list'} = $client->listgroup; } $self->error(0); # retry if we need to skip cancelled articles while (@{$self->{'news_scan_article_list'}}) { my $article = shift @{$self->{'news_scan_article_list'}}; my $fh = IO::File->new_tmpfile; unless (defined $fh) { return $self->error("Could not open temporary file: $!\n"); } my $lines = $client->article($article); next unless ref $lines; print $fh @$lines; $fh->seek(0, SEEK_SET); return $fh; } } sub _next_spool_article { my $self = shift; my $spool = $self->spool; unless (defined $self->{'news_scan_article_list'}) { unless (defined $spool) { return $self->error("News spool directory unknown"); } unless (opendir DIR, $spool) { return $self->error("Failed opendir $spool: $!"); } $self->{'news_scan_article_list'} = [ grep { -f "$spool/$_" && -s _ } readdir DIR ]; } $self->error(0); my $article = shift @{ $self->{'news_scan_article_list'} }; return undef unless defined $article; my $fh = new IO::File "< $spool/$article"; unless (defined $fh) { return $self->error("Failed open $spool/$article: $!"); } $fh; } sub next_article { my $self = shift; my $how = $self->from; unless (defined $how) { return $self->error("No news retrieval method specified!"); } $self->error(0); if ($how eq 'nntp') { return $self->_next_nntp_article; } elsif ($how eq 'spool') { return $self->_next_spool_article; } else { return $self->error("Unknown news source `$how'"); } } sub scan { my $self = shift; my $from; my $fh; my $article; unless (defined $self->name) { return $self->error("$self has no idea what its name is"); } while ($fh = $self->next_article) { $article = News::Scan::Article->new($fh, Modify => 0, $self); if (defined $article and not $self->excluded($article->author)) { $self->add_article($article); } } if ($self->error) { return undef; } $self->error(0); 1; } sub collect { my $self = shift; my $group; my $spool; $group = $self->name; unless (defined $group) { return $self->error("$self has no idea what group it is"); } $spool = $self->spool; unless (defined $spool) { return $self->error("$self does not know where its spool is"); } unless (-d $spool and -w _) { return $self->error("`$spool' not a directory or writable"); } unless ($self->nntp_connect) { return $self->error("Failed to create Net::NNTP object: " . $self->error); } my $client = $self->nntp_client; unless ($client->group($group)) { return $self->error("Invalid group name: `$group'"); } local $_; my %seen; if (open SEEN, "$spool/.seen") { while () { chomp; $seen{$_} = 1; } close SEEN; } for (grep { !-f "$spool/$_" && !$seen{$_} } @{ $client->listgroup }) { my $art = $client->article($_); unless ($art) { my $msg = $client->message; warn "$0: $group:$_: $msg\n"; next; } unless (open ART, ">$spool/$_") { return $self->error("Failed to save article"); } print ART @$art; close ART; } $self->error(0); 1; } sub error { my $self = shift; if (@_) { $self->{'news_scan_error'} = shift; return undef; } else { return $self->{'news_scan_error'}; } } sub add_article { my $self = shift; my $article = shift; return unless (defined $article and ref $article); $self->articles($self->articles + 1); $self->volume($self->volume + $article->size); $self->header_volume($self->header_volume + $article->header_size); $self->header_lines($self->header_lines + $article->header_lines); $self->body_volume($self->body_volume + $article->body_size); $self->body_lines($self->body_lines + $article->body_lines); $self->orig_volume($self->orig_volume + $article->orig_size); $self->orig_lines($self->orig_lines + $article->orig_lines); if (my $sig_size = $article->sig_size) { $self->signatures($self->signatures + 1); $self->sig_volume($self->sig_volume + $sig_size); $self->sig_lines($self->sig_lines + $article->sig_lines); } $self->add_poster($article); $self->add_crossposts($article); $self->add_to_thread($article); $self->error(0); } ## poster bookkeeping stuff sub add_poster { my $self = shift; my $art = shift; $self->error(0); my $posters = $self->{'news_scan_posters'}; my $poster; if (exists $posters->{lc $art->author->address}) { $poster = $posters->{lc $art->author->address}; } else { $posters->{lc $art->author->address} = new News::Scan::Poster $art; return; } $poster->message_ids($art->message_id); $poster->volume($poster->volume + $art->size); $poster->articles($poster->articles + 1); $poster->posted_to($art); $poster->header_volume($poster->header_volume + $art->header_size); $poster->header_lines($poster->header_lines + $art->header_lines); $poster->body_volume($poster->body_volume + $art->body_size); $poster->body_lines($poster->body_lines + $art->body_lines); $poster->orig_volume($poster->orig_volume + $art->orig_size); $poster->orig_lines($poster->orig_lines + $art->orig_lines); $poster->sig_volume($poster->sig_volume + $art->sig_size); $poster->sig_lines($poster->sig_lines + $art->sig_lines); } sub posters { my %posters = %{ $_[0]->{'news_scan_posters'} }; \%posters; } ## crossposts bookkeeping stuff sub add_crossposts { my $self = shift; my $art = shift; my %uniq; local $_; for ($art->newsgroups) { $uniq{lc $_}++; } delete $uniq{lc $self->name}; for (keys %uniq) { $self->{'news_scan_xposts'}{$_}++; } $self->error(0); } sub crossposts { my %xposts = %{ $_[0]->{'news_scan_xposts'} }; \%xposts; } ## thread bookkeeping stuff sub add_to_thread { my $self = shift; my $art = shift; my $threads = $self->{'news_scan_threads'}; my $thread; $self->error(0); ## find the subject my $subj = $art->subject; $subj =~ s/ ^Re ## leading Re (?:\[.*?\])? ## possible nonstandard crap :\s* ## trailing : and optional whitespace //ix; if (exists $threads->{$subj}) { $thread = $threads->{$subj}; } else { $threads->{$subj} = new News::Scan::Thread $art, $subj; return; } $thread->volume($thread->volume + $art->size); $thread->articles($thread->articles + 1); $thread->header_volume($thread->header_volume + $art->header_size); $thread->header_lines($thread->header_lines + $art->header_lines); $thread->body_volume($thread->body_volume + $art->body_size); $thread->body_lines($thread->body_lines + $art->body_lines); $thread->orig_volume($thread->orig_volume + $art->orig_size); $thread->orig_lines($thread->orig_lines + $art->orig_lines); $thread->sig_volume($thread->sig_volume + $art->sig_size); $thread->sig_lines($thread->sig_lines + $art->sig_lines); } sub threads { my %threads = %{ $_[0]->{'news_scan_threads'} }; \%threads; } sub DESTROY {} 1; __END__ =head1 NAME News::Scan - gather and report Usenet newsgroup statistics =head1 SYNOPSIS use News::Scan; my $scan = News::Scan->new; =head1 DESCRIPTION This module provides a class whose objects can be used to gather and report Usenet newsgroup statistics. =head1 CONSTRUCTOR =item new ( [ OPTIONS ] ) C is a list of named parameters (i.e. given in key-value pairs). Valid options are =over 4 =item B The value of this option is the name of the newsgroup you wish to scan. =item B The value of this option should be either C<'spool'> or C<'NNTP'> (case is not significant). Any other value will produce an error (see the C method description below). A value of C<'spool'> indicates that you would like to scan articles in a spool (see the B option below). A value of C<'NNTP'> indicates that articles should be retrieved from your NNTP server (see the B option below). =item B The value of this option should be the path to the spool directory that contains the articles you would like to scan. This option is ignored unless the value of B is C<'spool'>. =item B The value of this option (in the form I:I, with both being optional--see L for the semantics of omitting one or both of these parameters) indicates the NNTP server from which to retrieve articles. This option is ignored unless B is C<'NNTP'>. See the description of the B and B options below. =item B The value of this option should be a valid NNTP authentication login for your NNTP server. This option is only necessary if your NNTP server requires authentication. =item B The value of this option should be the password corresponding to the login in B. Having this hardcoded in a script is evil, and there should be a much better way. =item B The value of this option indicates the length of the period (in days) immediately prior to invocation of the program from which you would like to scan articles. The default period is seven (7) days. =item B The value of this option is a Perl regular expression that accepts quoted lines and rejects unquoted or original lines. The default regular expression is C<^\s{0,3}(?:>|:|\S+>|\+\+)>. =item B The value of this option should be a reference to an array containing regular expressions that accept email addresses of posters whose articles you wish to ignore. =item B The value of this option should be a reference to a hash whose keys are email addresses that should be transformed into the email addresses that are their corresponding values, i.e. C 'real@address'>. =back =head1 METHODS =over 4 =item configure ( [ OPTIONS ] ) C is a list of named parameters identical to those accepted by C. Re-C-ing an object after scanning is probably a bad idea. This method returns C if it encounters an error. =back The following methods are the actual underlying methods used to set and retrieve the configuration options of the same name (modulo case): =over 4 =item name ( [ NEWSGROUP-NAME ] ) =item spool ( [ SPOOL-DIRECTORY ] ) =item period ( [ INTERVAL-LENGTH ] ) =item aliases ( [ ALIASES-HASHREF ] ) =item from ( C<'NNTP'> | C<'spool'> ) =item quote_re ( [ QUOTE-REGEX-ARRAYREF ] ) =item exclude ( [ EXCLUSION-REGEX-ARRAYREF ] ) =item nntp_server ( [ [ NNTP-SERVER ]:[ NNTP-PORT ] ] ) =item nntp_auth_login ( [ LOGIN ] ) =item nntp_auth_passwd ( [ PASSWORD ] ) =back These methods can be used to retrieve information from the C object or ask it to perform some action. =over 4 =item error ( [ MESSAGE ] ) Use this method to determine whether an object has encountered an error condition. The return value of C is guaranteed to be C<0> after any method completes successfully (except C). (Keep in mind that this will also overwrite any previous error message.) If there has been an error, this method should return some useful message. If provided, C sets the object's error message. =item articles Returns the number of articles accounted for. =item volume Returns the volume of traffic (in bytes) to the newsgroup in the period. =item header_volume Returns the volume (in bytes) generated by headers. =item header_lines Returns the number of lines consumed by headers. =item body_volume Returns the volume (in bytes) generated by message bodies. =item body_lines Returns the number of lines consumed by message bodies. =item orig_volume Returns the volume (in bytes) of text which has been determined to be original (see B). Note that original traffic is a subset of body traffic. =item orig_lines Returns the number of lines that are determined to be original. =item signatures Returns the number of messages that had a cutline (/^-- $/). =item sig_volume Returns the volume (in bytes) generated by signatures. =item sig_lines Returns the number of lines consumed by signatures. =item earliest ( [ TIME ] ) Use this method to determine the date (in seconds since the Epoch) that the oldest article found within the period was posted to Usenet. If C