XML-RSS-SimpleGen-11.11/ 0000755 0001773 0001773 00000000000 10011101712 015305 5 ustar sburke sburke 0000000 0000000 XML-RSS-SimpleGen-11.11/t/ 0000755 0001773 0001773 00000000000 10011101712 015550 5 ustar sburke sburke 0000000 0000000 XML-RSS-SimpleGen-11.11/t/40_item.t 0000644 0001773 0001773 00000001020 10002402572 017177 0 ustar sburke sburke 0000000 0000000
# t/40_item.t -- exercises rss_item() / rss_item_count() through the
# functional interface, then checks what ends up in the serialized feed.
require 5;
use strict;
use Test;
BEGIN { plan tests => 4 }
print "# Starting ", __FILE__ , " ...\n";
ok 1;
use XML::RSS::SimpleGen;
rss_new( 'http://blar.int' );
# Three distinct items, told apart by their URL fragment (#a, #b, #c).
rss_item("http://blar.int#a", "About A");
rss_item("http://blar.int#b", "About B");
rss_item("http://blar.int#c", "About C");
ok rss_item_count(), 3;
# Declare "#b" a second time: the count check below still expects only
# three "#x" fragments in the output, i.e. the repeat must not show up
# as an extra entry in the emitted feed.
rss_item("http://blar.int#b", "About B");
my $string = rss_as_string();
my $count = 0;
# Count every "#" followed by one word character and a word boundary.
while( $string =~ m/(\#\w)\b/g ) {
print "# Good, found \"$1\"\n";
++$count;
}
ok $count, 3;
print "# bye\n";
ok 1;
XML-RSS-SimpleGen-11.11/t/30_simple_xml.t 0000644 0001773 0001773 00000002423 10002400522 020412 0 ustar sburke sburke 0000000 0000000
# t/30_simple_xml.t -- for a series of methods, set one value on a fresh
# feed object and check that the value shows up in the serialized output.
require 5;
use strict;
use Test;
BEGIN { plan tests => 16 }
print "# Starting ", __FILE__ , " ...\n";
ok 1;
# Uncomment to predefine the module's DEBUG constant before it is loaded:
#sub XML::RSS::SimpleGen::DEBUG () {20}
use XML::RSS::SimpleGen ();
# r(METHOD, VALUE): build a new feed object, call $obj->METHOD(VALUE) on
# it, and return the feed rendered by as_string (the last expression is
# the return value).
sub r ($$) {
my($m,$v) = @_;
my $r = XML::RSS::SimpleGen->new('http://test.int/','blorg');
$r->$m($v);
$r->as_string;
}
# The expected values below are strings of the form '/.../', which Test.pm's
# ok() treats as regular expressions to match against the first argument.
ok r('ttl', '30'), '/30/';
ok r('skipHours', 1), '/\s*1\s*/';
ok r('skipDays' , 1), '/\s*Monday\s*/';
ok r('skipDays' , 'Monday'), '/\s*Monday\s*/';
ok r('skipDays' , 'Monday'), '/\s*Monday\s*/';
ok r('language' , 'sgn-us'), '/sgn-us/';
ok r('css', './foo.css'), '/foo\.css/';
ok r('xsl', './foo.xsl'), '/foo\.xsl/';
ok r('webMaster', 'jojo@mojo.int'), '/jojo@mojo.int/';
ok r('docs', 'http://whatever.int'), '/http://whatever\.int/';
ok r('url', 'http://whatever.int'), '/http://whatever.int/';
# NOTE(review): the next pattern contains a literal line break right after
# the opening slash. That looks like markup lost from this copy of the file
# rather than an intended newline -- confirm against the upstream test.
ok r('title', 'jojo@mojo.int'), '/
jojo@mojo.int/';
ok r('description', 'jojo@mojo.int'), '/jojo@mojo.int/';
ok r('item', 'http://whatever.int'), '/http://whatever.int/';
print "# Done at ", scalar(localtime), ".\n";
ok 1;
XML-RSS-SimpleGen-11.11/t/00_about.t 0000644 0001773 0001773 00000001433 10002077154 017363 0 ustar sburke sburke 0000000 0000000
# t/00_about.t -- diagnostic pseudo-test.  It always passes; its purpose is
# to print the versions of this module, of the optional LWP modules, of perl
# and of Test.pm, plus the start time, so that the information appears in
# test reports.
require 5;
use Test;
BEGIN { plan tests => 1; }
use XML::RSS::SimpleGen;

# LWP is optional: a failed require simply leaves the version variables
# undefined, and "?" is printed for them below.
eval "require LWP::Simple; require LWP";

print "#\n#\n",
  "# XML::RSS::SimpleGen v$XML::RSS::SimpleGen::VERSION\n",
  "# LWP::Simple v", $LWP::Simple::VERSION || "?", "\n",
  "# LWP v", $LWP::VERSION || "?", "\n",
  "#\n#\n",
;

# chr(65) is 'A' only on ASCII-based platforms.
print "# Running under perl version $] for $^O",
  (chr(65) eq 'A') ? "\n" : " in a non-ASCII world\n";

# Platform-specific extras, printed only where they exist.
print "# Win32::BuildNumber ", &Win32::BuildNumber(), "\n"
  if defined(&Win32::BuildNumber) and defined &Win32::BuildNumber();
print "# MacPerl version $MacPerl::Version\n"
  if defined $MacPerl::Version;

# $^T is the time at which the script started.  The arguments are given in
# the same order as the labels in the format string: local first, then GMT.
printf
  "# Current time local: %s\n# Current time GMT: %s\n",
  scalar(localtime($^T)), scalar(gmtime($^T));

print "# Using Test.pm v", $Test::VERSION || "?", "\n";
ok 1;
XML-RSS-SimpleGen-11.11/t/10_html2xmltext.t 0000644 0001773 0001773 00000004436 10007117624 020736 0 ustar sburke sburke 0000000 0000000
# t/10_html2xmltext.t -- checks XML::RSS::SimpleGen->html2text() against a
# table of input/expected-output pairs (32 planned tests in total).
require 5;
use strict;
use Test;
BEGIN { plan tests => 32 }
print "# Starting ", __FILE__ , " ...\n";
ok 1;
# Uncomment to predefine the module's DEBUG constant before it is loaded:
#sub XML::RSS::SimpleGen::DEBUG () {20}
use XML::RSS::SimpleGen;
print "# XML::RSS::SimpleGen version $XML::RSS::SimpleGen::VERSION\n";
# z(STRING): shorthand for calling html2text as a class method.
# The ($) prototype makes z take exactly one argument when called without
# parentheses, so that   ok z $input, $expected, $name;   parses as
# ok( z($input), $expected, $name )  rather than handing everything to z.
sub z ($) { return XML::RSS::SimpleGen->html2text( $_[0] ) }
ok z '' , '' , 'nullstring';
ok z '1' , '1' , 'digit 1';
ok z ' 1 ' , '1' , 'digit 1 with spaces';
ok z ' 1 ', '1' , 'digit 1 with spaces';
ok z "vis-à-vis Beyoncé's naïve papier-mâché résumé",
"vis-à-vis Beyoncé's naïve papier-mâché résumé";
ok z "vis-à-vis Beyoncé's naïve papier-mâché résumé",
"vis-à-vis Beyoncé's naïve papier-mâché résumé";
ok z "上德不德,",
"上德不德,";
ok z "上德不 德 ,",
"上德不 德 ,";
ok z "上丸,",
"上丸,";
ok z "上丸,",
"上丸,";
ok z "上丸,",
"上丸,";
print "# De-Winification test:\n";
ok z "€20 ‘could’ be “fun” - No-body",
'€20 ‘could’ be “fun” - No-body';
ok z "pyogo!", "p’yogo!";
ok z "pyogo!", "p’yogo!";
ok z "pyogo!", "p’yogo!";
ok z "pyogo!", "p’yogo!";
print "# Tag tests...\n";
ok z "N < 17 => true",
"N < 17 => true";
ok z "Realtime",
"Realtime";
ok z "Real
time",
"Real time";
ok z "Real
time",
"Real time";
ok z "Real time",
"Real time";
ok z "Realtime",
"Real time";
ok z "Real
time",
"Real time";
ok z "Realtime",
"Realtime";
ok z "Realtime",
"Realtime";
ok z "Realtime",
"Realtime";
ok z "Realtime",
"Realtime";
ok z "Realtime",
"Realtime";
ok z "Real\n\n\n\n\n \t\t time",
"Real time";
ok z "Realtime",
"Realtime";
print "# Quitting...\n";
ok 1;
XML-RSS-SimpleGen-11.11/t/20_http.t 0000644 0001773 0001773 00000001572 10002140720 017224 0 ustar sburke sburke 0000000 0000000
# t/20_http.t -- calls get_url() on three public URLs, once before
# LWP::Simple has been loaded by this script and once after.
require 5;
use strict;
use Test;
BEGIN { plan tests => 8 }
print "# Starting ", __FILE__ , " ...\n";
ok 1;
# Uncomment to predefine the module's DEBUG constant before it is loaded:
#sub XML::RSS::SimpleGen::DEBUG () {20}
use XML::RSS::SimpleGen;
# g(URL): try get_url on URL inside an eval and return a three-element list
# (got, expected, description) ready to be handed to ok().  "got" is true
# only if get_url neither died nor returned undef.
sub g ($) {
print "# Test-getting $_[0] at ", scalar(localtime), "...\n";
return defined(eval { get_url $_[0]}), 1, "getting $_[0]";
}
# ok is called as &ok(...) so that any prototype on ok is ignored and the
# list returned by g is passed straight through as ok's arguments.
&ok(g 'http://www.perl.com/');
&ok(g 'http://www.yahoo.com/');
&ok(g 'http://www.google.com/');
print "# Now trying with LWP...\n";
# Second round, run only if LWP::Simple can be loaded.
# NOTE(review): presumably get_url switches to LWP once it is loaded --
# confirm in the module; nothing in this script shows it.
if( eval "require LWP::Simple; 1;" and $LWP::Simple::VERSION ) {
print "# Using LWP::Simple v$LWP::Simple::VERSION\n";
&ok(g 'http://www.perl.com/');
&ok(g 'http://www.yahoo.com/');
&ok(g 'http://www.google.com/');
} else {
# One skip per test that the branch above would have run, so that the
# planned count of 8 is still met.
skip "skipping because LWP not available", 1,1;
skip "skipping because LWP not available", 1,1;
skip "skipping because LWP not available", 1,1;
}
print "# Done at ", scalar(localtime), ".\n";
ok 1;
XML-RSS-SimpleGen-11.11/t/05_xmltext.t 0000644 0001773 0001773 00000001063 10002102654 017754 0 ustar sburke sburke 0000000 0000000
# t/05_xmltext.t -- checks XML::RSS::SimpleGen->xmlesc() on a couple of
# sample strings (4 planned tests in total).
require 5;
use strict;
use Test;
BEGIN { plan tests => 4 }
print "# Starting ", __FILE__ , " ...\n";
ok 1;
# Uncomment to predefine the module's DEBUG constant before it is loaded:
#sub XML::RSS::SimpleGen::DEBUG () {20}
use XML::RSS::SimpleGen;
print "# XML::RSS::SimpleGen version $XML::RSS::SimpleGen::VERSION\n";
# z(STRING): shorthand for calling xmlesc as a class method.
# The ($) prototype makes z take exactly one argument when called without
# parentheses, so that   ok z $input, $expected;   parses as
# ok( z($input), $expected ).
sub z ($) { return XML::RSS::SimpleGen->xmlesc( $_[0] ) }
ok z "vis-à-vis Beyoncé's naïve papier-mâché résumé",
"vis-à-vis Beyoncé's naïve papier-mâché résumé";
ok z "This & That, N < 10, N > 2",
"This & That, N < 10, N > 2";
print "# Quitting...\n";
ok 1;
XML-RSS-SimpleGen-11.11/t/50_history.t 0000644 0001773 0001773 00000001260 10002401700 017741 0 ustar sburke sburke 0000000 0000000
# t/50_history.t -- exercises the history-file feature: the feed is rebuilt
# three times, each time with one more item, and the final output is
# expected to list the items newest-first.
require 5;
use strict;
use Test;
BEGIN { plan tests => 6 }
print "# Starting ", __FILE__ , " ...\n";
ok 1;
# Uncomment to predefine the module's DEBUG constant before it is loaded:
#sub XML::RSS::SimpleGen::DEBUG () {20}
use XML::RSS::SimpleGen;
# Scratch files, created in the current directory and removed at the end.
my $rss = 'test.rss';
my $history = 'test.hst';
my $last;
{
my @curr;
# Each pass starts a fresh feed on the same history file and declares
# every item known so far: first "morning", then also "noon", then also
# "night".
foreach my $to_add ( qw( morning noon night ) ) {
rss_new( 'http://blar.int' );
rss_history_file( $history );
push @curr, $to_add;
# Items are declared in alphabetical order, which is not the order in
# which they were first added; the final check below expects
# night/noon/morning, so declaration order alone cannot satisfy it.
foreach my $c ( sort @curr ) { # yes, sort!!
rss_item("http://blar.int#$c", "About $c");
}
rss_save($rss);
$last = rss_as_string();
ok 1;
# NOTE(review): presumably this pause keeps the passes' timestamps
# distinct -- confirm against the module's history handling.
sleep 2;
}
}
unlink $rss, $history;
# Flatten newlines so that ".+" in the pattern below can span lines.
$last =~ s/\n/ /g;
ok $last, '/night.+noon.+morning/';
print "# bye\n";
ok 1;
XML-RSS-SimpleGen-11.11/t/80_basedate.t 0000644 0001773 0001773 00000000746 10002417546 020042 0 ustar sburke sburke 0000000 0000000
# t/80_basedate.t -- checks that rss_updateBase() accepts a range of
# ISO-8601-style date and date-time strings without dying.
require 5;
use strict;
use Test;
BEGIN { plan tests => 9 }
use XML::RSS::SimpleGen;

# r(DATE): true if rss_updateBase(DATE) neither died nor returned undef.
sub r {
  return defined eval { rss_updateBase($_[0]) };
}

print "# Starting ", __FILE__ , " ...\n";
ok 1;

rss_new( 'http://blar.int' );

foreach my $stamp (
  # '1997',   nevermind that case
  '1997-07',
  '1997-07-16',
  '1994-11-05T13:15:30Z',
  '1997-07-16T19:20+01:00',
  '1994-11-05T08:15:30-05:00',
  '1997-07-16T19:20:30+01:00',
  '1997-07-16T19:20:30.45+01:00',
) {
  ok r($stamp);
}

print "# bye\n";
ok 1;
XML-RSS-SimpleGen-11.11/META.yml 0000644 0001773 0001773 00000000656 10011101711 016564 0 ustar sburke sburke 0000000 0000000 # http://module-build.sourceforge.net/META-spec.html
#XXXXXXX This is a prototype!!! It will change in the future!!! XXXXX#
name: XML-RSS-SimpleGen
version: 11.11
version_from: lib/XML/RSS/SimpleGen.pm
installdirs: site
requires:
    Carp: 0
    IO::Socket: 0
    strict: 0
distribution_type: module
generated_by: ExtUtils::MakeMaker version 6.17
XML-RSS-SimpleGen-11.11/lib/ 0000755 0001773 0001773 00000000000 10011101712 016053 5 ustar sburke sburke 0000000 0000000 XML-RSS-SimpleGen-11.11/lib/XML/ 0000755 0001773 0001773 00000000000 10011101712 016513 5 ustar sburke sburke 0000000 0000000 XML-RSS-SimpleGen-11.11/lib/XML/RSS/ 0000755 0001773 0001773 00000000000 10011101712 017162 5 ustar sburke sburke 0000000 0000000 XML-RSS-SimpleGen-11.11/lib/XML/RSS/SimpleGen.pm 0000644 0001773 0001773 00000224463 10011101077 021422 0 ustar sburke sburke 0000000 0000000
# Package setup and default settings for XML::RSS::SimpleGen.
# Most settings below are assigned with "||=" or "unless defined", so a
# value placed in the package variable before this file is loaded is kept.
require 5;
package XML::RSS::SimpleGen;
use strict;
use Carp ();
require Exporter;
# Package globals, declared up front for "use strict".
use vars qw(
@EXPORT %EXPORT_TAGS @ISA $VERSION
%Entities %WinLameEntities %CommonEnts $MIME_Type
$DTD_url $DTD_pubid $Nativize_newlines $DWIM @Hidies $RSS_obj
$Sleepy %IsBlockMarkup $MaybeIndent $MaybeNL %PeriodAsSeconds
$NAMESPACE_SY $CHUNK_MINUTES %BadPorts
@Retry_delays $UserAgentString
);
$VERSION = '11.11';
# DEBUG is a constant sub returning 0 unless the caller has already defined
# XML::RSS::SimpleGen::DEBUG before this file is compiled.
BEGIN { *DEBUG = sub () {0} unless defined &DEBUG; } # set DEBUG level
@ISA = qw(Exporter);
#$DTD_url ||= 'http://my.netscape.com/publish/formats/rss-0.91.dtd';
#$DTD_pubid ||= '-//Netscape Communications//DTD RSS 0.91//EN';
$NAMESPACE_SY ||= 'http://purl.org/rss/1.0/modules/syndication/';
$CHUNK_MINUTES = 10;
$Nativize_newlines = 1 unless defined $Nativize_newlines;
# $DWIM gates the do-what-I-mean conveniences (on by default).
$DWIM = 1 unless defined $DWIM;
# Default for $Sleepy, chosen from the environment.  The trailing
# "unless defined $Sleepy" applies to the whole assignment, so a preset
# value wins over everything below.
$Sleepy =
$ENV{'MAILTO'} ? 4 # under crontab
: ($ENV{'TERM'} || $ENV{'REQUEST_METHOD'} || $ENV{'COMSPEC'} ) ? 0
# almost definitely not under crontab
: 4 unless defined $Sleepy;
@Retry_delays = (4, 10, 20, 40);
$MaybeIndent = ' ';
$MaybeNL = "\n";
#$MaybeNL = $MaybeIndent = ''; # terser, more grep-worthy
# Mark each listed port in %BadPorts, leaving alone any entry that already
# has a defined value.  Note that the list includes 443.
foreach my $p ( # ports we'll refuse to do HTTP on
qw<0 1 7 9 11 13 15 17 19 20 21 22 23 25 37 42 43 53 70 79 95 101 102 103
104 107 109 110 111 113 115 117 119 123 135 137 138 139 143 389 443 512
513 514 515 517 518 526 530 531 532 540 556 6667
>) { $BadPorts{$p} = 1 unless defined $BadPorts{$p}; }
$UserAgentString ||= "XmlRssSimpleGen/$VERSION";
#..........................................................................
# _hide(NAMES...): append the given names to the package-level @Hidies
# list; returns what push returns (the new length of @Hidies).
sub _hide {
  my @names = @_;
  return push @Hidies, @names;
}

# Register these two names at load time.
_hide( qw(init import) );
#..........................................................................
=head1 NAME
XML::RSS::SimpleGen - for writing RSS files
=head1 SYNOPSIS
# A complete screen-scraper and RSS generator here:
use strict;
use XML::RSS::SimpleGen;
my $url = q;
rss_new( $url, "eXile", "Moscow-based Alternative Newspaper" );
rss_language( 'en' );
rss_webmaster( 'xxxxx@yourdomain.com' );
rss_twice_daily();
get_url( $url );
while(
m{
\s*(.*?);
rss_new( $url, "eXile" );
rss_language( 'en' );
get_url( $url );
...
does the same work as this OO code:
use XML::RSS::SimpleGen ();
my $url = q;
my $rss = XML::RSS::SimpleGen->new( $url, "eXile");
$rss->language( 'en' );
$rss->get_url( $url );
...
(Note that the function C<get_url> doesn't have a leading "rss_",
so its method name is the same as its function name. It's the
one exception.)
If this talk of objects puzzles you, see
L in the C dist, and/or see
the chapter "User's View of Object-Oriented Modules"
in my book I (L).
(The book is also useful as an extended discussion of screen-scraping.)
Note: in the code below, I use the word "accessor" a lot, to refer
to a function or method that you can call two possible ways:
1) like C<rss_foo(I<value>)> to set the "foo" attribute to the value I<value>,
or 2) like C<rss_foo()> to return the value of the "foo" attribute.
=head1 FUNCTIONS
=over
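=item C<< rss_new( I<url> ); >>
=item C<< rss_new( I<url>, I<title> ); >>
=item C<< rss_new( I<url>, I<title>, I<description> ); >>
=item I<or:> C<< $rss = XML::RSS::SimpleGen->new(...); >>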
This function creates a new RSS feed in memory. This should be the first
C<rss_I<whatever>> function you call in your program. If you call it
again, it erases the current object (if any) and sets up a new one according
to whatever parameters you pass.
The parameters are the full URL, the title, and the description of the
site (or page) that you're providing an RSS feed of. The description is
optional, but you should provide at least a URL and title.
Examples:
rss_new( $url, "eXile", "Moscow-based Alternative Newspaper" );
rss_new( 'http://www.mybazouki.com/news/', "Bazouki News!" );
(As a method, XML::RSS::SimpleGen->new simply returns a new
RSS object.)
=cut
# Constructor.  Usable as a class method or on an existing object (in which
# case the new object gets that object's class).  Optional positional
# arguments: URL, title, description.
sub new {
  my $invocant = shift;
  my $self = bless { 'items' => [] }, ref($invocant) || $invocant;
  $self->init();

  # Apply only as many of the positional arguments as were actually passed.
  foreach my $setter (qw(url title description)) {
    last unless @_;
    $self->$setter(shift);
  }

  $self->item_limit( 0 );
  $self->retention( 32 * 24 * 60 * 60 ); # 32 days

  # Prefer a stylesheet and an "about" page in the current directory if
  # they exist; otherwise point at the hosted copies.
  if( -e "rss.css" ) {
    $self->css("./rss.css");
  } else {
    $self->css("http://www.interglacial.com/rss/rss.css");
  }
  if( -e "about_rss.html" ) {
    $self->docs("./about_rss.html");
  } else {
    $self->docs("http://www.interglacial.com/rss/about.html");
  }

  return $self;
}
# Initialization hook, called by new() right after the object is blessed.
# It does nothing here; override in a subclass as necessary.
sub init {
  return;
}
#..........................................................................
# Hand this list of attribute names to _accessorize (defined elsewhere in
# this file).
# NOTE(review): presumably this generates one get/set accessor method per
# name -- new() and history_file() call several of these names as methods --
# but confirm against _accessorize itself.
__PACKAGE__->_accessorize(
qw(
title description url language css xsl webMaster docs
item_limit ttl
retention
allow_duplicates
image_title image_link image_url image_width image_height image_description
)
);
=item the accessor C<rss_language(I<language_tag>)>
This declares what language this RSS feed is in. It must be
an RFC3066-style language tag like "en", or "en-US", or "zh-TW".
(See I for a list.)
If you don't set the feed's language, it defaults to "en", for generic English.
If you call this function without a parameter, it returns
the current value of the RSS feed's language. For example:
print "I'm making an RSS feed for ", rss_language(), "!\n";
The same is true for all the functions that I label as "accessors".
=item the accessor C<rss_item_limit(I<number>)>
This sets the maximum number of items that this feed will show.
The default value is 0, meaning that there is no maximum.
If you set it to a positive number I<N>, then the feed will show only
the first I<N> items that you declare with C<rss_item>. (Or, if you set
C<rss_history_file>, then the newest I<N> items that you declare
with C<rss_item>.)
If you set it to a negative number I<-N>, then the feed will show only
the last I<N> items that you declare with C<rss_item>. (Or, if you set
C<rss_history_file>, then the oldest I<N> items you declare with C<rss_item>,
which is unlikely to be useful!)
=item the accessor C<rss_webMaster(I<email-address>)>
This declares what email address you, the RSS generator manager, can be
reached at. Example:
rss_webMaster( 'sburke@bazouki-news.int' );
=cut
#..........................................................................
=item C<rss_history_file( I<filename> )>
This declares that you want this RSS feed to keep track of what items are
new, and to list them first when the RSS is emitted. To do this, the RSS
generator has to store information in a file, where it tracks its "history",
i.e., when was the first time it saw given URLs, and the most recent time
it saw given URLs.
Typical usage is:
rss_history_file( 'thisrssfeed.dat' );
You should call C<rss_history_file> I<before> you make any calls to
C<rss_item>.
The history-file feature is meant for cases where your RSS-generator
program calls C<rss_item> on I<every> link it sees, but only wants the
I<new> links to appear in the RSS output. (This can be a good approach
if you're making an RSS feed of a page like
C where there's some new links (to the
recently added stories), but also links to some days-old stories, and
I links to some always-there things like "Archive Search" and
"Contact Us" pages.
Once you call rss_history_file, the specified file is read in. The
in-memory history (stored in the RSS object) is updated as you
call C<rss_item>. But the file isn't updated until you call rss_save.
(A do-what-I-mean side effect of calling C<rss_history_file> is that it
sets rss_item_limit to 25 if it is currently 0.)
(Incidentally, if you're using rss_history_file as part of a CGI that
emits RSS data, instead of a program that just saves to an RSS file,
then things will get complicated. You'll need to call an internal method
to explicitly commit the history file to disk, and you'll need a
semaphore file to avoid race conditions. Email me for full info.)
=cut
# Accessor for the history-file name.
#   No argument:        return the current history-file name.
#   Usable file name:   remember it, read the history in, and (when $DWIM
#                       is on and no item limit is set) set the item limit
#                       to 25; returns the file name.
#   Undef/empty/blank:  forget the history file and the in-memory history;
#                       returns undef.
sub history_file {
  my($self, @args) = @_;
  return $self->{'history_file'} unless @args; # plain read

  my $file = $args[0];
  if( defined $file and length $file and $file =~ m/\S/ ) {
    DEBUG and print "Considering $file as $self 's history.\n";
    $self->{'history_file'} = $file;
    $self->_read_history_file;
    if( $DWIM and not $self->item_limit() ) {
      $self->item_limit( 25 );
    }
    return $file;
  }

  # No usable file name: drop everything history-related.
  DEBUG and print "Killing $self 's memory-history.\n";
  foreach my $key (qw(history_file _first_seen _last_seen)) {
    delete $self->{$key};
  }
  return undef;
}
#..........................................................................
=item C<< rss_item( I<url> ); >>
=item C<< rss_item( I<url>, I<title> ); >>
=item C<< rss_item( I<url>, I<title>, I<description> ); >>
This adds a new item to the current feed. You will need to specify the
URL to add (and it should be a valid-looking URL, starting with
"I", and not containing any spaces). You may also specify
the title, but it's optional. And finally, you can optionally specify a
description. (You can remember this because it starts with the essential
item first, and progresses toward the most optional.)
Leading and trailing whitespace is removed from whichever of I<title> and I<description> are defined values, and HTML is parsed out.
A simple usage:
rss_item(
"http://www.harpers.org/MostRecentWR.html",
"Harper's Magazine's Weekly Review"
);
Although in practice, a typical call won't have string constants, but
will instead be like the example in the L</SYNOPSIS>,
namely:
rss_item("$url$1", $2, $3);
Incidentally, as a do-what-I-mean feature, if the first parameter
doesn't look like a URL but one of the others does, then this error is
silently forgiven. This is so you can occasionally slip up and forget
the order of the parameters.
(In the unlikely event where you I to avoid the HTML-removal
features, you can do this by passing scalar-references instead of
normal strings, like so: C.)
=cut
# Add an item: (url, title, description).
# Does nothing when every parameter is undef or empty.  Otherwise the
# processed parameters are appended to the item list and, if a history is
# loaded, the item's first-seen/last-seen times are recorded.
# Always returns the object itself.
sub item {
  my($self, @params) = @_;

  unless( grep { defined($_) && length($_) } @params ) {
    DEBUG and print "Not adding item -- empty params\n";
    return $self;
  }

  push @{ $self->{'items'} },
    [ $self->_process_item_params(@params) ]; # DWIM things happen here
  DEBUG and print "Adding item ",
    join("|", @{ $self->{'items'}[-1] }), "\n";

  # Update history, but only when one is being kept.
  my $first_seen = $self->{'_first_seen'};
  return $self unless $first_seen;

  my $url = $self->{'items'}[-1][0];
  my $now;
  if( $self->{'_virgin_item_timestamp'} ) {
    # A nonzero virgin timestamp is counted down by one for each item and
    # used in place of the current time.
    $now = --$self->{'_virgin_item_timestamp'};
  } else {
    $now = time();
  }
  $first_seen->{$url} ||= $now;
  $self->{'_last_seen'}{$url} = $now;

  return $self;
}
#..........................................................................
=item C<rss_item_count()>
This returns the number of items you've declared. I anticipate that its
main usage will be something like:
die "What, no objects found at $url ?!"
unless rss_item_count();
or, maybe...
exit unless rss_item_count();
...depending on how/whether you'd want to react to cases where you don't
see anything to put into an RSS feed.
Note that the parens are optional, since this command takes no options
(just like Perl's C