URI-Title-1.86/000755 000765 000024 00000000000 11760207337 013307 5ustar00tomistaff000000 000000 URI-Title-1.86/Changes000644 000765 000024 00000004537 11760207224 014606 0ustar00tomistaff000000 000000 2012-05-26 1.86 Tom Insam * Twitter suport fixed for non-javascript anchor links (yay) 2011-06-02 1.85 Tom Insam * Twitter suport fixed for javascript anchor links (ugh) 2009-07-27 1.84 Tom Insam * Spotify titling support from https://rt.cpan.org/Public/Bug/Display.html?id=48231 2008-10-18 1.83 Paul Mison * Work around a change in Twitter HTML 2008-09-30 1.82 Tom Insam * Latest LWP-UserAgent is pickier about what you pass it as a URI - fixed. 2008-06-19 1.81 Tom Insam * fix changes for the Register title * clearer licensing 2008-06-19 1.80 Tom Insam * Twitter support * Google calculator support * Too many irritating corner-case bugfixes to list. * Claim that we don't do gzip in some cases, to avoid evil 2007-04-21 1.70 Tom Insam * Added special case for Helsingin Sanomat URLs. * Supressed warnings if imgsize can't get an image size (from Dagfinn Ilmari Mannsaker) * Fixed some more nagging unicode issues. Bah, unicode. * Send a real user-agent, to prevent people that hard-code a block of the default Perl useragent from blocking us. 2006-05-18 1.62 Tom Insam * character set support that actually _works_ - we also try utf-8 first always, because people lie. * Fix for servers that don't respect the Range header. 2005-07-21 1.61 Tom Insam * Fix tiny test breakage * Better charset support 2004-09-23 1.60 Tom Insam * Added limited character set support. It mostly sucks, but less than it did before. 2004-08-13 1.50 Tom Insam * Added quick special case for iTMS urls. * Bumped version to 1.0, I quite like this code now. 2004-05-06 0.50 Tom Insam * 0.5 Release * Use File::Type to detech mime types and hand off to sub-modules * Use Module::Pluggable to discover sub-modules * Add Image, MP3 and PDF naming modules 2004-03-16 0.30 Tom Insam * 0.3 Release (Revision 643) * Don't run tests without a net connection * More special cases. * Variable header size downloading for pathological cases * Much better whitespace trimming 2004-01-17 0.10 Tom Insam * 0.1 Release. It doesn't suck, much. URI-Title-1.86/lib/000755 000765 000024 00000000000 11760207337 014055 5ustar00tomistaff000000 000000 URI-Title-1.86/Makefile.PL000644 000765 000024 00000000563 11026666763 015274 0ustar00tomistaff000000 000000 use ExtUtils::MakeMaker; require 5.008001; WriteMakefile( 'NAME' => 'URI::Title', 'VERSION_FROM' => 'lib/URI/Title.pm', 'PREREQ_PM' => { 'Test::More' => 0, 'Module::Pluggable' => '1.2', 'File::Type' => '0.22', 'HTML::Parser' => '3.45', 'MP3::Info' => 0, 'Image::Size' => 0, 'LWP::Simple' => 0 } ); URI-Title-1.86/MANIFEST000644 000765 000024 00000000414 11760207337 014437 0ustar00tomistaff000000 000000 Changes lib/URI/Title.pm lib/URI/Title/HTML.pm lib/URI/Title/Image.pm lib/URI/Title/MP3.pm lib/URI/Title/PDF.pm Makefile.PL MANIFEST t/00bootstrap.t t/failure.t t/html.t t/other.t title.pl META.yml Module meta-data (added by MakeMaker) URI-Title-1.86/META.yml000644 000765 000024 00000001215 11760207337 014557 0ustar00tomistaff000000 000000 --- #YAML:1.0 name: URI-Title version: 1.86 abstract: ~ author: [] license: unknown distribution_type: module configure_requires: ExtUtils::MakeMaker: 0 build_requires: ExtUtils::MakeMaker: 0 requires: File::Type: 0.22 HTML::Parser: 3.45 Image::Size: 0 LWP::Simple: 0 Module::Pluggable: 1.2 MP3::Info: 0 Test::More: 0 no_index: directory: - t - inc generated_by: ExtUtils::MakeMaker version 6.56 meta-spec: url: http://module-build.sourceforge.net/META-spec-v1.4.html version: 1.4 URI-Title-1.86/t/000755 000765 000024 00000000000 11760207337 013552 5ustar00tomistaff000000 000000 URI-Title-1.86/title.pl000755 000765 000024 00000000273 10407557423 014773 0ustar00tomistaff000000 000000 #!/usr/bin/perl use warnings; use strict; use lib 'lib'; use URI::Title qw(title); use Encode; my $title = title(shift); binmode STDOUT, ":utf8"; print $title || 'no title'; print "\n"; URI-Title-1.86/t/00bootstrap.t000644 000765 000024 00000000140 07775377536 016133 0ustar00tomistaff000000 000000 #!perl -w use strict; use lib qw(lib ../lib); use Test::More tests => 1; use_ok('URI::Title'); URI-Title-1.86/t/failure.t000644 000765 000024 00000000310 10002306720 015340 0ustar00tomistaff000000 000000 #!perl -w use strict; use lib qw(lib ../lib); use Test::More tests => 1; use URI::Title qw(title); # it's much easier to test for failure is(title('nonsense'), undef, "Title of nonsense is blank"); URI-Title-1.86/t/html.t000644 000765 000024 00000001304 11571177235 014704 0ustar00tomistaff000000 000000 use warnings; use strict; use Test::More; use lib 'lib'; use URI::Title qw(title); require IO::Socket; my $s = IO::Socket::INET->new( PeerAddr => "www.yahoo.com:80", Timeout => 10, ); if ($s) { close($s); plan tests => 2; } else { plan skip_all => "no net connection available"; exit; } #is( # title('http://jerakeen.org/test/uri-title.html'), # "URI::Title test", # "got title for jerakeen.org"); ok( title('http://theregister.co.uk/content/6/34549.html') =~ /lack of technology may harm your prospects/, "got register title"); ok( title('http://twitter.com/al3x/status/1039647490') eq 'twitter - Arianna Huffington: not a good saleswoman for blogging.', "got Twitter status"); URI-Title-1.86/lib/URI/000755 000765 000024 00000000000 11760207337 014514 5ustar00tomistaff000000 000000 URI-Title-1.86/lib/URI/Title/000755 000765 000024 00000000000 11760207337 015575 5ustar00tomistaff000000 000000 URI-Title-1.86/lib/URI/Title.pm000644 000765 000024 00000014275 11760207226 016141 0ustar00tomistaff000000 000000 =head1 NAME URI::Title - get the titles of things on the web in a sensible way =head1 SYNOPSIS use URI::Title qw( title ); my $title = title('http://microsoft.com'); print "Title is $title\n"; =head1 DESCRIPTION I keep having to find the title of things on the web. This seems like a really simple request, just get() the object, parse for a title tag, you're done. Ha, I wish. There are several problems with this approach: =over 4 =item What if the resource is on a very slow server? Do we wait for ever or what? =item What if the resource is a 900 gig file? You don't want to download that. =item What if the page title isn't in a title tag, but is buried in the HTML somewhere? =item What if the resource is an MP3 file, or a word document or something? =item ... =back So, let's solve these issues once. =head1 METHODS only one, the title(url) method. Call it with an url, get the title if possible, undef if it wasn't. Very simple. =head1 TODO Many, many, many things. Still unimplemented: =over 4 =item Get titles of MP3 files, Word Docs, PDFs, etc. =item Configurable.. well, anything, in fact. Timeout would be a good start. =item Better error reporting. =head1 AUTHOR Tom Insam Etom@jerakeen.orgE This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =head1 CREDITS Invented because of a conversation with rjp, who contributed some eyeball-melting and as-yet-unused code to get titles from MP3s and PDFs, and hex, who has also solved the problem, and got bits done in a nicer way than I did. =cut package URI::Title; use warnings; use strict; use base qw(Exporter); our @EXPORT_OK = qw( title ); our $VERSION = '1.86'; use Module::Pluggable (search_path => ['URI::Title'], require => 1 ); use File::Type; use LWP::UserAgent; use HTTP::Request; use HTTP::Response; sub ua { my $ua = LWP::UserAgent->new; $ua->agent("URI::Title/$VERSION"); $ua->timeout(20); $ua->default_header('Accept-Encoding' => 'gzip'); return $ua; } sub get_limited { my $url = shift; my $size = shift || 32*1024; my $ua = ua(); $ua->max_size($size); my $req = HTTP::Request->new(GET => $url); $req->header( Range => "bytes=0-$size" ); $req->header( "Accept-Encoding" => "" ); # vox sends invalid gzipped data? my $res = eval { $ua->request($req) }; return unless $res; # useragent explodes for non-valid uris # some servers don't like the Range header. If we # get an odd 4xx response that isn't 404, just try getting # the full thing. This may be a little impolite. return get_all($url) if $res->code >= 400 and $res->code < 500 and $res->code != 404; return unless $res->is_success; if (!wantarray) { return $res->decoded_content || $res->content; } my $cset = "iso-8859-1"; # default; my $ct = $res->header("Content-type"); if ($ct =~ /charset\s*=\>?\s*\"?([\w-]+)/i) { $cset = lc($1); #warn "Got charset $cset from URI headers\n"; } return ($res->decoded_content || $res->content, $cset); } sub get_end { my $url = shift; my $size = shift || 16*1024; my $ua = ua(); my $request = HTTP::Request->new(HEAD => $url); my $response = $ua->request($request); return unless $response; # useragent explodes for non-valid uris my $length = $response->header('Content-Length'); return unless $length; # We can't get the length, and we're _not_ # going to get the whole thing. my $start = $length - $size; $ua->max_size($size); my $req = HTTP::Request->new(GET => $url); $req->header( Range => "bytes=$start-$length" ); my $res = $ua->request($req); return unless $res; # useragent explodes for non-valid uris return unless $res->is_success; return $res->decoded_content unless wantarray; my $cset = "iso-8859-1"; # default; my $ct = $res->header("Content-type"); if ($ct =~ /charset=\"?(.*)\"?$/) { $cset = $1; } return ($res->decoded_content, $cset); } sub get_all { my $url = shift; my $ua = ua(); my $req = HTTP::Request->new(GET => $url); my $res = $ua->request($req); return unless $res->is_success; return $res->decoded_content unless wantarray; my $cset = "iso-8859-1"; # default; my $ct = $res->header("Content-type"); if ($ct =~ /charset=\"?(.*)\"?$/) { $cset = $1; } return ($res->decoded_content, $cset); } # cache our $HANDLERS; sub handlers { my @plugins = plugins(); return $HANDLERS if $HANDLERS; for my $plugin (@plugins) { for my $type ($plugin->types) { $HANDLERS->{$type} = $plugin; } } return $HANDLERS; } sub title { my $param = shift; my $data; my $url; my $type; my $cset = "iso-8859-1"; # default # we can be passed a hashref. Keys are url, or data. if (ref($param)) { if ($param->{data}) { $data = $param->{data}; $data = $$data if ref($data); # we can be passed a ref to the data } elsif ($param->{url}) { $url = $param->{url}; } else { use Carp qw(croak); croak("Expected a single parameter, or an 'url' or 'data' key"); } # otherwise, assume we're passed an url } else { $url = $param; } if (!$url and !$data) { warn "Need at least an url or data"; return; } # If we don't have data, we will have an url, so try to get data. if (!$data) { # url might be a filename if (-e $url) { local $/ = undef; unless (open DATA, $url) { warn "$url looks like a file and isn't"; return; } $data = ; close DATA; # If not, assume it's an url } else { # special case for itms if ($url =~ s/^itms:/http:/) { $type = "itms"; $data = 1; # we don't need it, fake it. } else { # special case for spotify $url =~ s{^(?:http://open.spotify.com/|spotify:)(\w+)[:/]}{http://spotify.url.fi/$1/}; $url =~ s{#!}{?_escaped_fragment_=}; ($data, $cset) = get_limited($url); } } } if (!$data) { #warn "Can't get content for $url"; return; } return undef unless $data; $type ||= File::Type->new->checktype_contents($data); my $handlers = handlers(); my $handler = $handlers->{$type} || $handlers->{default} or return; return $handler->title($url, $data, $type, $cset); } 1; URI-Title-1.86/lib/URI/Title/HTML.pm000644 000765 000024 00000005143 11760207020 016667 0ustar00tomistaff000000 000000 =head NAME URI::Title::HTML - get titles of html files =cut package URI::Title::HTML; use warnings; use strict; use HTML::Entities; use utf8; our $CAN_USE_ENCODE; BEGIN { eval { require Encode; Encode->import('decode') }; $CAN_USE_ENCODE = !$@; } sub types {( 'text/html', 'default', )} sub title { my ($class, $url, $data, $type, $cset) = @_; my $title; my $special_case; my $default_match = '(.+?)title($1); } # TODO - work this out from the headers of the HTML if ($data =~ /charset=\"?([\w-]+)/i) { $cset = lc($1); } if ( $CAN_USE_ENCODE ) { $data = eval { decode('utf-8', $data, 1) } || eval { decode($cset, $data, 1) } || $data; } my $found_title; if ($url =~ /use\.perl\.org\/~([^\/]+).*journal\/\d/i) { $special_case = '(.+?)<'; $title = "use.perl journal of $1 - "; } elsif ($url =~ /(pants\.heddley\.com|dailychump\.org).*#(.*)$/i) { my $id = $2; $special_case = 'id="a'.$id.'.*?>(.+?)<'; $title = "pants daily chump - "; } elsif ($url =~ /paste\.husk\.org/i) { $special_case = 'Summary: (.+?)<'; $title = "paste - "; } elsif ($url =~ /twitter.com\/(.*?)\/status(es)?\/\d+/i) { $special_case = '

([^\<]+)'; $title = "twitter - "; } elsif ($url =~ /independent\.co\.uk/i) { $special_case = '

(.+?)<'; } elsif ($url =~ /www\.hs\.fi\/english\/article/i) { $special_case = '

(.+?)

'; } elsif ($url =~ /google.com/i and $data =~ /calc_img/) { # google can be used as a calculator. Try to find the result. $special_case = 'calc_img.*(.+?)(.+?)<\/sup>/^$1/g; # for the google math output $found_title =~ s/<.*?>//g; $title .= $found_title; $title =~ s/\s+$//; $title =~ s/^\s+//; $title =~ s/\n+//g; $title =~ s/\s+/ /g; #use Devel::Peek; #Dump( $title ); $title = decode_entities($title); #Dump( $title ); # decode nasty number-encoded entities. Mostly works $title =~ s/(&\#(\d+);?)/chr($2)/eg; return $title; } 1; URI-Title-1.86/lib/URI/Title/Image.pm000644 000765 000024 00000000602 11003147041 017134 0ustar00tomistaff000000 000000 =head NAME URI::Title::Image - get titles of images =cut package URI::Title::Image; use warnings; use strict; use Image::Size; sub types {( 'image/gif', 'image/jpg', 'image/jpeg', 'image/png', )} sub title { my ($class, $url, $data, $type) = @_; my ($x, $y) = imgsize(\$data); $type =~ s!^[^/]*/!!; return $type unless $x && $y; return "$type ($x x $y)"; } 1; URI-Title-1.86/lib/URI/Title/MP3.pm000644 000765 000024 00000002132 10046401502 016512 0ustar00tomistaff000000 000000 =head NAME URI::Title::MP3 - get titles of MP3 files =cut package URI::Title::MP3; use warnings; use strict; use MP3::Info; use File::Temp qw(tempfile); sub types {( 'audio/mp3', )} sub get_tag { my $data = shift; my (undef, $temp) = tempfile(); open FILE, ">$temp" or die $!; print FILE $data; close FILE; my $tag = get_mp3tag($temp); if ($tag) { my $info = get_mp3info($temp); $tag->{info} = $info; } unlink($temp); return $tag; } sub title { my ($class, $url, $data, $type) = @_; my $tag; if (-f $url) { $tag = get_mp3tag($url); if ($tag) { my $info = get_mp3info($url); $tag->{info} = $info; } } else { $tag = get_tag( $data . URI::Title::get_end($url) ); } return unless $tag; return unless ($tag->{ARTIST} or $tag->{TITLE}); $tag->{ARTIST} ||= "Unknown Artist"; $tag->{TITLE} ||= "Unknown Title"; my $title = "$tag->{ARTIST} - $tag->{TITLE}"; if (my $total = $tag->{info}{SECS} and -f $url) { my $m = $total / 60; my $s = $total % 60; $title .= sprintf(" (%d:%02d)", $m, $s); } return $title; } 1; URI-Title-1.86/lib/URI/Title/PDF.pm000644 000765 000024 00000001735 10046401502 016534 0ustar00tomistaff000000 000000 =head NAME URI::Title::PDF - get titles of PDF files =cut package URI::Title::PDF; use warnings; use strict; sub types {( 'application/pdf', )} sub title { my ($class, $url, $data, $type) = @_; my %fields = (); my $content = URI::Title::get_end($url) or return; foreach my $i (qw(Producer Creator CreationDate Author Title Subject)) { my @parts = $content =~ m#/$i \((.*?)\)#mgs; $fields{$i} = $parts[-1]; # grab the last one, hopefully right } my $title = ""; my @parts = (); if ($fields{Title}) { push @parts, "$fields{Title}"; if ($fields{Author}) { push @parts, "by $fields{Author}"; } if ($fields{Subject}) { push @parts, "($fields{Subject})"; } } if ($fields{Creator} and $fields{Creator} ne 'Not Available') { push @parts, "creator: $fields{Creator}"; } if ($fields{Producer} and $fields{Producer} ne 'Not Available') { push @parts, "produced: $fields{Producer}"; } $title = join(' ', @parts); return $title; } 1;