URL-Search-0.000005/0000755000175000017500000000000013154775323012603 5ustar maukemaukeURL-Search-0.000005/Makefile_PL_settings.plx0000644000175000017500000000075613154771575017375 0ustar maukemaukeuse strict; use warnings; return { NAME => 'URL::Search', AUTHOR => q{Lukas Mai }, MIN_PERL_VERSION => '5.10.0', CONFIGURE_REQUIRES => {}, BUILD_REQUIRES => {}, TEST_REQUIRES => { 'utf8' => 0, 'Test::More' => 0, }, PREREQ_PM => { 'strict' => 0, 'warnings' => 0, 'Exporter' => 5.57, }, DEVELOP_REQUIRES => { 'Test::Pod' => 1.22, }, REPOSITORY => [ github => 'mauke' ], }; URL-Search-0.000005/README0000644000175000017500000000222113154775323013460 0ustar maukemaukeNAME URL::Search - search for URLs in plain text INSTALLATION To download and install this module, use your favorite CPAN client, e.g. "cpan": cpan URL::Search Or "cpanm": cpanm URL::Search To do it manually, run the following commands (after downloading and unpacking the tarball): perl Makefile.PL make make test make install SUPPORT AND DOCUMENTATION After installing, you can find documentation for this module with the "perldoc" command. perldoc URL::Search You can also look for information at . To see a list of open bugs, visit . To report a new bug, send an email to "bug-URL-Search [at] rt.cpan.org". COPYRIGHT & LICENSE Copyright 2016, 2017 Lukas Mai. This program is free software; you can redistribute it and/or modify it under the terms of either: the GNU General Public License as published by the Free Software Foundation; or the Artistic License. See for more information. URL-Search-0.000005/Changes0000644000175000017500000000067613154775036014110 0ustar maukemaukeRevision history for URL-Search 0.000005 2017-09-09 - allow unescaped { } \ in query strings (apparently Pale Moon does this) 0.000004 2017-04-16 - don't rely on '.' in @INC 0.000003 2016-09-20 - allow various Unicode hyphens/dashes because Wikipedia uses EN DASH in paths 0.000002 2016-09-04 - change version number format 0.000001 2016-09-04 - initial release URL-Search-0.000005/lib/0000755000175000017500000000000013154775322013350 5ustar maukemaukeURL-Search-0.000005/lib/URL/0000755000175000017500000000000013154775322014012 5ustar maukemaukeURL-Search-0.000005/lib/URL/Search.pm0000644000175000017500000002034013154775036015556 0ustar maukemaukepackage URL::Search; use strict; use warnings; use v5.10.0; # recursive regex subgroups use Exporter 5.57 qw(import); our $VERSION = '0.000005'; our @EXPORT_OK = qw( $URL_SEARCH_RE extract_urls partition_urls ); our $URL_SEARCH_RE = do { my $general_unicode = qr{ [^\p{ASCII}\p{Control}\p{Space}\p{Punct}] | [\x{2010}\x{2011}\x{2012}\x{2013}\x{2014}\x{2015}] # HYPHEN, NON-BREAKING HYPHEN, # FIGURE DASH, EN DASH, EM DASH, HORIZONTAL BAR }xms; my $protocol = qr{ (?: [Hh][Tt] | [Ff] ) [Tt][Pp] [Ss]? }xms; my $unreserved_subdelims_colon = qr{ [a-zA-Z0-9\-._~!\$&'()*+,;=:] }xms; my $pct_enc = qr{ % [[:xdigit:]]{2} }xms; my $userinfo = qr{ $unreserved_subdelims_colon* (?: $pct_enc $unreserved_subdelims_colon* )* }xms; my $host = do { my $dec_octet = qr{ 25[0-5] | 2[0-4][0-9] | 1[0-9][0-9] | [1-9][0-9] | [0-9] }xms; my $ipv4_addr = qr{ $dec_octet (?: \. $dec_octet ){3} }xms; my $h16 = qr{ [[:xdigit:]]{1,4} }xms; my $ls32 = qr{ $h16 : $h16 | $ipv4_addr }xms; my $ipv6_addr = qr{ (?: $h16 : ){6} $ls32 | :: (?: $h16 : ){5} $ls32 | (?: $h16 )? :: (?: $h16 : ){4} $ls32 | (?: $h16 (?: : $h16 ){0,1} )? :: (?: $h16 : ){3} $ls32 | (?: $h16 (?: : $h16 ){0,2} )? :: (?: $h16 : ){2} $ls32 | (?: $h16 (?: : $h16 ){0,3} )? :: $h16 : $ls32 | (?: $h16 (?: : $h16 ){0,4} )? :: $ls32 | (?: $h16 (?: : $h16 ){0,5} )? :: $h16 | (?: $h16 (?: : $h16 ){0,6} )? :: }xms; my $ipvfuture = qr{ v [[:xdigit:]]+ \. $unreserved_subdelims_colon+ }xms; my $ip_literal = qr{ \[ (?: $ipv6_addr | $ipvfuture ) \] }xms; my $hostname = do { my $alnum = qr{ [a-zA-Z0-9] | $general_unicode }xms; my $label = qr { $alnum+ (?: -+ $alnum+ )* }xms; qr{ $label (?: \. $label )* \.? }xms }; qr{ $hostname | $ip_literal }xms }; my $path = qr{ / ( (?: [a-zA-Z0-9\-._~!\$&'*+,;=:\@/] | $pct_enc | \( (?-1) \) | $general_unicode )* ) }xms; my $query = qr{ ( (?: [a-zA-Z0-9\-._~!\$&'*+,;=:\@/?\\{}] | $pct_enc | \( (?-1) \) | \[ (?-1) \] | $general_unicode )* ) }xms; my $fragment = $query; qr{ $protocol :// (?: $userinfo \@ )? $host (?: : [0-9]+ )? $path? (?: \? $query )? (?: \# $fragment )? (? substr $text, $pos_prev, $-[0] - $pos_prev] if $pos_prev < $-[0]; push @parts, [URL => $1]; $pos_prev = $+[0]; } push @parts, [TEXT => substr $text, $pos_prev] if $pos_prev < length $text; @parts } 'ok' __END__ =encoding utf8 =for highlighter language=perl =head1 NAME URL::Search - search for URLs in plain text =head1 SYNOPSIS use URL::Search qw( $URL_SEARCH_RE extract_urls partition_urls ); if ($text =~ /($URL_SEARCH_RE)/) { print "the first URL in text was: $1\n"; } my @all_urls = extract_urls $text; =head1 DESCRIPTION This module searches plain text for URLs and extracts them. It exports (on request) the following entities: =head2 C<$URL_SEARCH_RE> This variable is the core of this module. It contains a regex that matches a URL. NOTE: This regex uses capturing groups internally, so if you embed it in a bigger pattern, the numbering of any following capture groups will be off. It only matches URLs with an explicit schema (one of C, C, C, C). The pattern is deliberately not anchored at the beginning, i.e. it will match C in C<"click herehttp://foo">. If you don't want that, use C. It tries to exclude artifacts of the surrounding text: =for highlighter Is mayonnaise an instrument? (https://en.wikipedia.org/wiki/Instrument, https://en.wikipedia.org/wiki/Mayonnaise_(instrument)) In this example it will match C and C, without the comma after "Instrument" and the final closing parenthesis. It understands all common URL elements: username, hostname, port, path, query string, fragment identifier. The hostname can be an IP address (IPv4 and IPv6 are both supported). Unicode is supported (e.g. C is matched correctly). =head2 C This function takes a string and returns a list of all contained URLs. It uses L|/C<$URL_SEARCH_RE>> to find matches. Example: =for highlighter language=perl my $text = 'Visit us at http://html5zombo.com. Also, https://archive.org'; my @urls = extract_urls $text; # @urls = ('http://html5zombo.com', 'https://archive.org') =head2 C This function takes a string and splits it up into text and URL segments. It returns a list of array references, each of which has two elements: The type (the string C<'TEXT'> or C<'URL'>) and the portion of the input string that was classified as text or URL, respectively. Example: my $text = 'Visit us at http://html5zombo.com. Also, https://archive.org'; my @parts = partition_urls $text; # @parts = ( # [ 'TEXT', 'Visit us at ' ], # [ 'URL', 'http://html5zombo.com' ], # [ 'TEXT', '. Also, ' ], # [ 'URL', 'https://archive.org' ], # ) You can reassemble the original string by concatenating the second elements of the returned arrayrefs, i.e. C<< join('', map { $_->[1] } partition_urls($text)) eq $text >>. This function can be useful if you want to render plain text as HTML but hyperlink all embedded URLs: use URL::Search qw(partition_urls); use HTML::Entities qw(encode_entities); my $text = ...; my $html = ''; for my $part (partition_urls $text) { my ($type, $str) = @$part; $str = encode_entities $str; if ($type eq 'URL') { $html .= "$str"; } else { $html .= $str; } } # result is in $html =begin :README =head1 INSTALLATION To download and install this module, use your favorite CPAN client, e.g. L|cpan>: =for highlighter language=sh cpan URL::Search Or L|cpanm>: cpanm URL::Search To do it manually, run the following commands (after downloading and unpacking the tarball): perl Makefile.PL make make test make install =end :README =head1 SUPPORT AND DOCUMENTATION After installing, you can find documentation for this module with the L|perldoc> command. =for highlighter language=sh perldoc URL::Search You can also look for information at L. To see a list of open bugs, visit L. To report a new bug, send an email to C. =head1 AUTHOR Lukas Mai, C<< >> =head1 COPYRIGHT & LICENSE Copyright 2016, 2017 Lukas Mai. This program is free software; you can redistribute it and/or modify it under the terms of either: the GNU General Public License as published by the Free Software Foundation; or the Artistic License. See L for more information. URL-Search-0.000005/META.yml0000644000175000017500000000134113154775322014052 0ustar maukemauke--- abstract: 'search for URLs in plain text' author: - 'Lukas Mai ' build_requires: Test::More: '0' utf8: '0' configure_requires: ExtUtils::MakeMaker: '0' File::Find: '0' File::Spec: '0' strict: '0' warnings: '0' dynamic_config: 0 generated_by: 'ExtUtils::MakeMaker version 7.3, CPAN::Meta::Converter version 2.150010' license: perl meta-spec: url: http://module-build.sourceforge.net/META-spec-v1.4.html version: '1.4' name: URL-Search no_index: directory: - t - inc - xt requires: Exporter: '5.57' perl: '5.010000' strict: '0' warnings: '0' resources: repository: git://github.com/mauke/URL-Search version: '0.000005' x_serialization_backend: 'CPAN::Meta::YAML version 0.018' URL-Search-0.000005/xt/0000755000175000017500000000000013154775322013235 5ustar maukemaukeURL-Search-0.000005/xt/pod.t0000644000175000017500000000011312762567052014202 0ustar maukemauke#!perl use strict; use warnings; use Test::Pod 1.22; all_pod_files_ok(); URL-Search-0.000005/META.json0000644000175000017500000000305513154775323014227 0ustar maukemauke{ "abstract" : "search for URLs in plain text", "author" : [ "Lukas Mai " ], "dynamic_config" : 0, "generated_by" : "ExtUtils::MakeMaker version 7.3, CPAN::Meta::Converter version 2.150010", "license" : [ "perl_5" ], "meta-spec" : { "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec", "version" : 2 }, "name" : "URL-Search", "no_index" : { "directory" : [ "t", "inc", "xt" ] }, "prereqs" : { "build" : { "requires" : {} }, "configure" : { "requires" : { "ExtUtils::MakeMaker" : "0", "File::Find" : "0", "File::Spec" : "0", "strict" : "0", "warnings" : "0" } }, "develop" : { "requires" : { "Pod::Markdown" : "3.005", "Pod::Text" : "4.09", "Test::Pod" : "1.22" } }, "runtime" : { "requires" : { "Exporter" : "5.57", "perl" : "5.010000", "strict" : "0", "warnings" : "0" } }, "test" : { "requires" : { "Test::More" : "0", "utf8" : "0" } } }, "release_status" : "stable", "resources" : { "repository" : { "type" : "git", "url" : "git://github.com/mauke/URL-Search", "web" : "https://github.com/mauke/URL-Search" } }, "version" : "0.000005", "x_serialization_backend" : "JSON::PP version 2.94" } URL-Search-0.000005/Makefile.PL0000644000175000017500000001221413154771325014553 0ustar maukemaukeuse strict; use warnings; use ExtUtils::MakeMaker; use File::Spec (); use File::Find (); sub find_tests_recursively_in { my ($dir) = @_; -d $dir or die "$dir is not a directory"; my %seen; my $wanted = sub { /\.t\z/ or return; my $directories = (File::Spec->splitpath($File::Find::name))[1]; my $depth = grep $_ ne '', File::Spec->splitdir($directories); $seen{$depth} = 1; }; File::Find::find($wanted, $dir); join ' ', map { $dir . '/*' x $_ . '.t' } sort { $a <=> $b } keys %seen } $::MAINT_MODE = !-f 'META.yml'; my $settings_file = 'Makefile_PL_settings.plx'; my %settings = %{do "./$settings_file" or die "Internal error: can't do $settings_file: ", $@ || $!}; { $settings{depend}{Makefile} .= " $settings_file"; $settings{LICENSE} ||= 'perl'; $settings{PL_FILES} ||= {}; $settings{CONFIGURE_REQUIRES}{strict} ||= 0; $settings{CONFIGURE_REQUIRES}{warnings} ||= 0; $settings{CONFIGURE_REQUIRES}{'ExtUtils::MakeMaker'} ||= 0; $settings{CONFIGURE_REQUIRES}{'File::Find'} ||= 0; $settings{CONFIGURE_REQUIRES}{'File::Spec'} ||= 0; my $module_file = $settings{NAME}; $module_file =~ s!::!/!g; $module_file = "lib/$module_file.pm"; $settings{VERSION_FROM} ||= $module_file; $settings{ABSTRACT_FROM} ||= $module_file; $settings{test}{TESTS} ||= find_tests_recursively_in 't'; $settings{DISTNAME} ||= do { my $name = $settings{NAME}; $name =~ s!::!-!g; $name }; $settings{clean}{FILES} ||= "$settings{DISTNAME}-*"; $settings{dist}{COMPRESS} ||= 'gzip -9f'; $settings{dist}{SUFFIX} ||= '.gz'; my $version = $settings{VERSION} || MM->parse_version($settings{VERSION_FROM}); if ($version =~ s/-TRIAL[0-9]*\z//) { $settings{META_MERGE}{release_status} ||= 'unstable'; $settings{META_MERGE}{version} ||= $version; $settings{XS_VERSION} ||= $version; } $settings{META_MERGE}{'meta-spec'}{version} ||= 2; $settings{META_MERGE}{dynamic_config} ||= 0; push @{$settings{META_MERGE}{no_index}{directory}}, 'xt'; if (my $dev = delete $settings{DEVELOP_REQUIRES}) { @{$settings{META_MERGE}{prereqs}{develop}{requires}}{keys %$dev} = values %$dev; } if (my $rec = delete $settings{RECOMMENDS}) { @{$settings{META_MERGE}{prereqs}{runtime}{recommends}}{keys %$rec} = values %$rec; } if (my $sug = delete $settings{SUGGESTS}) { @{$settings{META_MERGE}{prereqs}{runtime}{suggests}}{keys %$sug} = values %$sug; } if (my $repo = delete $settings{REPOSITORY}) { if (ref($repo) eq 'ARRAY') { my ($type, @args) = @$repo; if ($type eq 'github') { my ($account, $project) = @args; $project ||= '%d'; $project =~ s{%(L?)(.)}{ my $x = $2 eq '%' ? '%' : $2 eq 'd' ? $settings{DISTNAME} : $2 eq 'm' ? $settings{NAME} : die "Internal error: unknown placeholder %$1$2"; $1 ? lc($x) : $x }seg; my $addr = "github.com/$account/$project"; $repo = { type => 'git', url => "git://$addr", web => "https://$addr", }; } else { die "Internal error: unknown REPOSITORY type '$type'"; } } ref($repo) eq 'HASH' or die "Internal error: REPOSITORY must be a hashref, not $repo"; @{$settings{META_MERGE}{resources}{repository}}{keys %$repo} = values %$repo; } } (do './maint/eumm-fixup.pl' || die $@ || $!)->(\%settings) if $::MAINT_MODE; (my $mm_version = ExtUtils::MakeMaker->VERSION) =~ tr/_//d; if ($mm_version < 6.63_03) { $settings{META_MERGE}{resources}{repository} = $settings{META_MERGE}{resources}{repository}{url} if $settings{META_MERGE}{resources} && $settings{META_MERGE}{resources}{repository} && $settings{META_MERGE}{resources}{repository}{url}; delete $settings{META_MERGE}{'meta-spec'}{version}; } elsif ($mm_version < 6.67_04) { # Why? For the glory of satan, of course! no warnings qw(redefine); *ExtUtils::MM_Any::_add_requirements_to_meta_v1_4 = \&ExtUtils::MM_Any::_add_requirements_to_meta_v2; } { my $merge_key_into = sub { my ($target, $source) = @_; %{$settings{$target}} = (%{$settings{$target}}, %{delete $settings{$source}}); }; $merge_key_into->('BUILD_REQUIRES', 'TEST_REQUIRES') if $mm_version < 6.63_03; $merge_key_into->('CONFIGURE_REQUIRES', 'BUILD_REQUIRES') if $mm_version < 6.55_01; $merge_key_into->('PREREQ_PM', 'CONFIGURE_REQUIRES') if $mm_version < 6.51_03; } delete $settings{MIN_PERL_VERSION} if $mm_version < 6.47_01; delete $settings{META_MERGE} if $mm_version < 6.46; delete $settings{LICENSE} if $mm_version < 6.30_01; delete $settings{ABSTRACT_FROM} if $mm_version < 6.06_03; delete $settings{AUTHOR} if $mm_version < 6.06_03; WriteMakefile %settings; URL-Search-0.000005/MANIFEST0000644000175000017500000000056713154775323013744 0ustar maukemaukeChanges lib/URL/Search.pm Makefile.PL Makefile_PL_settings.plx MANIFEST MANIFEST.SKIP t/test.t xt/pod.t META.yml Module YAML meta-data (added by MakeMaker) META.json Module JSON meta-data (added by MakeMaker) README generated from URL::Search POD (added by maint/eumm-fixup.pl) URL-Search-0.000005/t/0000755000175000017500000000000013154775322013045 5ustar maukemaukeURL-Search-0.000005/t/test.t0000644000175000017500000000313013154775036014210 0ustar maukemauke#!perl use strict; use warnings FATAL => 'all'; use utf8; use Test::More tests => 4; use URL::Search qw($URL_SEARCH_RE extract_urls partition_urls); my @results = ( [TEXT => 'Check out '], [URL => 'http://example.com/1'], [TEXT => ', '], [URL => 'https://example.com?page[id]=2'], [TEXT => ", and\n"], [URL => 'http://127.1:8080/cgi-bin/render.dll?index#query'], [TEXT => '. More: '], [URL => 'ftp://host/dir/file.tar.gz'], [TEXT => ".\nClick here"], [URL => 'http://A4.paper'], [TEXT => "\n("], [URL => 'Http://user@site/?a=b&c=d;e=f'], [TEXT => ' and '], [URL => 'http://en.wikipedia.org/wiki/Mayonnaise_(instrument)'], [TEXT => ")\n<"], [URL => 'https://a.#b'], [TEXT => '> ('], [URL => 'http://c/d'], [TEXT => ') ['], [URL => 'http://[::1]/sweet-home'], [TEXT => "]\n"], [URL => 'http://déjà-vu/€?utf8=✓'], [TEXT => ' - '], [URL => 'https://en.wikipedia.org/wiki/Hornbostel–Sachs'], [TEXT => ' '], [URL => 'http://موقع.وزارة-الأتصالات.مصر/最近更改'], [TEXT => "\netc."], [URL => 'http://поддомен.example.com/déjà-vu?utf8=✓'], [TEXT => "\n"], [URL => 'https://grep.metacpan.org/search?size=20&q=map\s*\{\s*\%24_\s*\}&qd=&qft='], [TEXT => ' # incidentally'], ); my $corpus = join '', map $_->[1], @results; #diag $corpus; is_deeply [partition_urls $corpus], \@results; is_deeply [extract_urls $corpus], [map $_->[0] eq 'URL' ? $_->[1] : (), @results]; ok $corpus =~ /$URL_SEARCH_RE/; is substr($corpus, $-[0], $+[0] - $-[0]), $results[1][1]; URL-Search-0.000005/MANIFEST.SKIP0000644000175000017500000000023712763002612014470 0ustar maukemauke\.tar\.gz$ ^Build$ ^URL-Search- ^MANIFEST\.(?!SKIP$) ^MYMETA\. ^Makefile$ ^Makefile\.old$ ^\. ^_build ^blib ^cover_db$ ^pm_to_blib ^remote$ ^untracked ^maint/