URL-Search-0.06/0000755000175000017500000000000014406141226012272 5ustar maukemaukeURL-Search-0.06/Makefile_PL_settings.plx0000644000175000017500000000072214406133411017045 0ustar maukemaukeuse strict; use warnings; return { NAME => 'URL::Search', AUTHOR => q{Lukas Mai }, MIN_PERL_VERSION => '5.10.0', CONFIGURE_REQUIRES => {}, BUILD_REQUIRES => {}, TEST_REQUIRES => { 'Test2::V0' => 0, }, PREREQ_PM => { 'strict' => 0, 'warnings' => 0, 'Exporter' => 5.57, }, DEVELOP_REQUIRES => { 'Test::Pod' => 1.22, }, REPOSITORY => [ github => 'mauke' ], }; URL-Search-0.06/Makefile.PL0000644000175000017500000001221414406131403014240 0ustar maukemaukeuse strict; use warnings; use ExtUtils::MakeMaker; use File::Spec (); use File::Find (); sub find_tests_recursively_in { my ($dir) = @_; -d $dir or die "$dir is not a directory"; my %seen; my $wanted = sub { /\.t\z/ or return; my $directories = (File::Spec->splitpath($File::Find::name))[1]; my $depth = grep $_ ne '', File::Spec->splitdir($directories); $seen{$depth} = 1; }; File::Find::find($wanted, $dir); join ' ', map { $dir . '/*' x $_ . '.t' } sort { $a <=> $b } keys %seen } $::MAINT_MODE = !-f 'META.yml'; my $settings_file = 'Makefile_PL_settings.plx'; my %settings = %{do "./$settings_file" or die "Internal error: can't do $settings_file: ", $@ || $!}; { $settings{depend}{Makefile} .= " $settings_file"; $settings{LICENSE} ||= 'perl'; $settings{PL_FILES} ||= {}; $settings{CONFIGURE_REQUIRES}{strict} ||= 0; $settings{CONFIGURE_REQUIRES}{warnings} ||= 0; $settings{CONFIGURE_REQUIRES}{'ExtUtils::MakeMaker'} ||= 0; $settings{CONFIGURE_REQUIRES}{'File::Find'} ||= 0; $settings{CONFIGURE_REQUIRES}{'File::Spec'} ||= 0; my $module_file = $settings{NAME}; $module_file =~ s!::!/!g; $module_file = "lib/$module_file.pm"; $settings{VERSION_FROM} ||= $module_file; $settings{ABSTRACT_FROM} ||= $module_file; $settings{test}{TESTS} ||= find_tests_recursively_in 't'; $settings{DISTNAME} ||= do { my $name = $settings{NAME}; $name =~ s!::!-!g; $name }; $settings{clean}{FILES} ||= "$settings{DISTNAME}-*"; $settings{dist}{COMPRESS} ||= 'gzip -9f'; $settings{dist}{SUFFIX} ||= '.gz'; my $version = $settings{VERSION} || MM->parse_version($settings{VERSION_FROM}); if ($version =~ s/-TRIAL[0-9]*\z//) { $settings{META_MERGE}{release_status} ||= 'unstable'; $settings{META_MERGE}{version} ||= $version; $settings{XS_VERSION} ||= $version; } $settings{META_MERGE}{'meta-spec'}{version} ||= 2; $settings{META_MERGE}{dynamic_config} ||= 0; push @{$settings{META_MERGE}{no_index}{directory}}, 'xt'; if (my $dev = delete $settings{DEVELOP_REQUIRES}) { @{$settings{META_MERGE}{prereqs}{develop}{requires}}{keys %$dev} = values %$dev; } if (my $rec = delete $settings{RECOMMENDS}) { @{$settings{META_MERGE}{prereqs}{runtime}{recommends}}{keys %$rec} = values %$rec; } if (my $sug = delete $settings{SUGGESTS}) { @{$settings{META_MERGE}{prereqs}{runtime}{suggests}}{keys %$sug} = values %$sug; } if (my $repo = delete $settings{REPOSITORY}) { if (ref($repo) eq 'ARRAY') { my ($type, @args) = @$repo; if ($type eq 'github') { my ($account, $project) = @args; $project ||= '%d'; $project =~ s{%(L?)(.)}{ my $x = $2 eq '%' ? '%' : $2 eq 'd' ? $settings{DISTNAME} : $2 eq 'm' ? $settings{NAME} : die "Internal error: unknown placeholder %$1$2"; $1 ? lc($x) : $x }seg; my $addr = "github.com/$account/$project"; $repo = { type => 'git', url => "git://$addr", web => "https://$addr", }; } else { die "Internal error: unknown REPOSITORY type '$type'"; } } ref($repo) eq 'HASH' or die "Internal error: REPOSITORY must be a hashref, not $repo"; @{$settings{META_MERGE}{resources}{repository}}{keys %$repo} = values %$repo; } } (do './maint/eumm-fixup.pl' || die $@ || $!)->(\%settings) if $::MAINT_MODE; (my $mm_version = ExtUtils::MakeMaker->VERSION) =~ tr/_//d; if ($mm_version < 6.63_03) { $settings{META_MERGE}{resources}{repository} = $settings{META_MERGE}{resources}{repository}{url} if $settings{META_MERGE}{resources} && $settings{META_MERGE}{resources}{repository} && $settings{META_MERGE}{resources}{repository}{url}; delete $settings{META_MERGE}{'meta-spec'}{version}; } elsif ($mm_version < 6.67_04) { # Why? For the glory of satan, of course! no warnings qw(redefine); *ExtUtils::MM_Any::_add_requirements_to_meta_v1_4 = \&ExtUtils::MM_Any::_add_requirements_to_meta_v2; } { my $merge_key_into = sub { my ($target, $source) = @_; %{$settings{$target}} = (%{$settings{$target}}, %{delete $settings{$source}}); }; $merge_key_into->('BUILD_REQUIRES', 'TEST_REQUIRES') if $mm_version < 6.63_03; $merge_key_into->('CONFIGURE_REQUIRES', 'BUILD_REQUIRES') if $mm_version < 6.55_01; $merge_key_into->('PREREQ_PM', 'CONFIGURE_REQUIRES') if $mm_version < 6.51_03; } delete $settings{MIN_PERL_VERSION} if $mm_version < 6.47_01; delete $settings{META_MERGE} if $mm_version < 6.46; delete $settings{LICENSE} if $mm_version < 6.30_01; delete $settings{ABSTRACT_FROM} if $mm_version < 6.06_03; delete $settings{AUTHOR} if $mm_version < 6.06_03; WriteMakefile %settings; URL-Search-0.06/META.json0000664000175000017500000000301714406141226013716 0ustar maukemauke{ "abstract" : "search for URLs in plain text", "author" : [ "Lukas Mai " ], "dynamic_config" : 0, "generated_by" : "ExtUtils::MakeMaker version 7.64, CPAN::Meta::Converter version 2.150010", "license" : [ "perl_5" ], "meta-spec" : { "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec", "version" : 2 }, "name" : "URL-Search", "no_index" : { "directory" : [ "t", "inc", "xt" ] }, "prereqs" : { "build" : { "requires" : {} }, "configure" : { "requires" : { "ExtUtils::MakeMaker" : "0", "File::Find" : "0", "File::Spec" : "0", "strict" : "0", "warnings" : "0" } }, "develop" : { "requires" : { "Pod::Markdown" : "3.005", "Pod::Text" : "4.09", "Test::Pod" : "1.22" } }, "runtime" : { "requires" : { "Exporter" : "5.57", "perl" : "5.010000", "strict" : "0", "warnings" : "0" } }, "test" : { "requires" : { "Test2::V0" : "0" } } }, "release_status" : "stable", "resources" : { "repository" : { "type" : "git", "url" : "git://github.com/mauke/URL-Search", "web" : "https://github.com/mauke/URL-Search" } }, "version" : "0.06", "x_serialization_backend" : "JSON::PP version 4.07" } URL-Search-0.06/t/0000755000175000017500000000000014406141226012535 5ustar maukemaukeURL-Search-0.06/t/test.t0000644000175000017500000000312614406135107013704 0ustar maukemauke#!perl use Test2::V0; use URL::Search qw($URL_SEARCH_RE extract_urls partition_urls); my @results = ( [TEXT => 'Check out '], [URL => 'http://example.com/1'], [TEXT => ', '], [URL => 'https://example.com?page[id]=2'], [TEXT => ", and\n"], [URL => 'http://127.1:8080/cgi-bin/render.dll?index#query'], [TEXT => '. More: '], [URL => 'hTtpS://host/dir/file.tar.gz'], [TEXT => ".\nClick here"], [URL => 'http://A4.paper'], [TEXT => "\n("], [URL => 'Http://user@site/?a=b&c=d;e=f'], [TEXT => ' and '], [URL => 'http://en.wikipedia.org/wiki/Mayonnaise_(instrument)'], [TEXT => ")\n<"], [URL => 'https://a.#b'], [TEXT => '> ('], [URL => 'http://c/d'], [TEXT => ') ['], [URL => 'http://[::1]/sweet-home'], [TEXT => "]\n"], [URL => 'http://déjà-vu/€?utf8=✓'], [TEXT => ' - '], [URL => 'https://en.wikipedia.org/wiki/Hornbostel–Sachs'], [TEXT => ' '], [URL => 'http://موقع.وزارة-الأتصالات.مصر/最近更改'], [TEXT => "\netc."], [URL => 'http://поддомен.example.com/déjà-vu?utf8=✓'], [TEXT => "\n"], [URL => 'https://grep.metacpan.org/search?size=20&q=map\s*\{\s*\%24_\s*\}&qd=&qft='], [TEXT => ' # incidentally'], ); my $corpus = join '', map $_->[1], @results; #diag $corpus; is [partition_urls $corpus], \@results; is [extract_urls $corpus], [map $_->[0] eq 'URL' ? $_->[1] : (), @results]; if ($corpus =~ /$URL_SEARCH_RE/) { is substr($corpus, $-[0], $+[0] - $-[0]), $results[1][1]; } else { fail "corpus matches \$URL_SEARCH_RE"; } done_testing; URL-Search-0.06/Changes0000644000175000017500000000125314406141174013570 0ustar maukemaukeRevision history for URL-Search 0.06 2023-03-20 - drop support for ftp: and ftps: URLs because current browsers don't speak FTP anymore - switch from Test::More to Test2::V0 - switch to a version number that's less crazy 0.000005 2017-09-09 - allow unescaped { } \ in query strings (apparently Pale Moon does this) 0.000004 2017-04-16 - don't rely on '.' in @INC 0.000003 2016-09-20 - allow various Unicode hyphens/dashes because Wikipedia uses EN DASH in paths 0.000002 2016-09-04 - change version number format 0.000001 2016-09-04 - initial release URL-Search-0.06/MANIFEST.SKIP0000644000175000017500000000021214406132241014160 0ustar maukemauke(?' build_requires: Test2::V0: '0' configure_requires: ExtUtils::MakeMaker: '0' File::Find: '0' File::Spec: '0' strict: '0' warnings: '0' dynamic_config: 0 generated_by: 'ExtUtils::MakeMaker version 7.64, CPAN::Meta::Converter version 2.150010' license: perl meta-spec: url: http://module-build.sourceforge.net/META-spec-v1.4.html version: '1.4' name: URL-Search no_index: directory: - t - inc - xt requires: Exporter: '5.57' perl: '5.010000' strict: '0' warnings: '0' resources: repository: git://github.com/mauke/URL-Search version: '0.06' x_serialization_backend: 'CPAN::Meta::YAML version 0.018' URL-Search-0.06/xt/0000755000175000017500000000000014406141226012725 5ustar maukemaukeURL-Search-0.06/xt/pod.t0000644000175000017500000000011314406131403013663 0ustar maukemauke#!perl use strict; use warnings; use Test::Pod 1.22; all_pod_files_ok(); URL-Search-0.06/MANIFEST0000644000175000017500000000056714406141226013433 0ustar maukemaukeChanges lib/URL/Search.pm Makefile.PL Makefile_PL_settings.plx MANIFEST MANIFEST.SKIP t/test.t xt/pod.t META.yml Module YAML meta-data (added by MakeMaker) META.json Module JSON meta-data (added by MakeMaker) README generated from URL::Search POD (added by maint/eumm-fixup.pl) URL-Search-0.06/README0000664000175000017500000000222114406141226013151 0ustar maukemaukeNAME URL::Search - search for URLs in plain text INSTALLATION To download and install this module, use your favorite CPAN client, e.g. "cpan": cpan URL::Search Or "cpanm": cpanm URL::Search To do it manually, run the following commands (after downloading and unpacking the tarball): perl Makefile.PL make make test make install SUPPORT AND DOCUMENTATION After installing, you can find documentation for this module with the "perldoc" command. perldoc URL::Search You can also look for information at . To see a list of open bugs, visit . To report a new bug, send an email to "bug-URL-Search [at] rt.cpan.org". COPYRIGHT & LICENSE Copyright 2016, 2017, 2023 Lukas Mai. This program is free software; you can redistribute it and/or modify it under the terms of either: the GNU General Public License as published by the Free Software Foundation; or the Artistic License. See for more information. URL-Search-0.06/lib/0000755000175000017500000000000014406141226013040 5ustar maukemaukeURL-Search-0.06/lib/URL/0000755000175000017500000000000014406141226013502 5ustar maukemaukeURL-Search-0.06/lib/URL/Search.pm0000644000175000017500000002074314406140632015253 0ustar maukemaukepackage URL::Search; use strict; use warnings; use v5.10.0; # recursive regex subgroups use Exporter 5.57 qw(import); our $VERSION = '0.06'; our @EXPORT_OK = qw( $URL_SEARCH_RE extract_urls partition_urls ); our $URL_SEARCH_RE = do { my $general_unicode = qr{ [^\p{ASCII}\p{Control}\p{Space}\p{Punct}] | [\x{2010}\x{2011}\x{2012}\x{2013}\x{2014}\x{2015}] # HYPHEN, NON-BREAKING HYPHEN, # FIGURE DASH, EN DASH, EM DASH, HORIZONTAL BAR }xms; my $protocol = qr{ [Hh][Tt][Tt][Pp] [Ss]? }xms; my $unreserved_subdelims_colon = qr{ [a-zA-Z0-9\-._~!\$&'()*+,;=:] }xms; my $pct_enc = qr{ % [[:xdigit:]]{2} }xms; my $userinfo = qr{ $unreserved_subdelims_colon* (?: $pct_enc $unreserved_subdelims_colon* )* }xms; my $host = do { my $dec_octet = qr{ 25[0-5] | 2[0-4][0-9] | 1[0-9][0-9] | [1-9][0-9] | [0-9] }xms; my $ipv4_addr = qr{ $dec_octet (?: \. $dec_octet ){3} }xms; my $h16 = qr{ [[:xdigit:]]{1,4} }xms; my $ls32 = qr{ $h16 : $h16 | $ipv4_addr }xms; my $ipv6_addr = qr{ (?: $h16 : ){6} $ls32 | :: (?: $h16 : ){5} $ls32 | (?: $h16 )? :: (?: $h16 : ){4} $ls32 | (?: $h16 (?: : $h16 ){0,1} )? :: (?: $h16 : ){3} $ls32 | (?: $h16 (?: : $h16 ){0,2} )? :: (?: $h16 : ){2} $ls32 | (?: $h16 (?: : $h16 ){0,3} )? :: $h16 : $ls32 | (?: $h16 (?: : $h16 ){0,4} )? :: $ls32 | (?: $h16 (?: : $h16 ){0,5} )? :: $h16 | (?: $h16 (?: : $h16 ){0,6} )? :: }xms; my $ipvfuture = qr{ v [[:xdigit:]]+ \. $unreserved_subdelims_colon+ }xms; my $ip_literal = qr{ \[ (?: $ipv6_addr | $ipvfuture ) \] }xms; my $hostname = do { my $alnum = qr{ [a-zA-Z0-9] | $general_unicode }xms; my $label = qr { $alnum+ (?: -+ $alnum+ )* }xms; qr{ $label (?: \. $label )* \.? }xms }; qr{ $hostname | $ip_literal }xms }; my $path = qr{ / ( (?: [a-zA-Z0-9\-._~!\$&'*+,;=:\@/] | $pct_enc | \( (?-1) \) | $general_unicode )* ) }xms; my $query = qr{ ( (?: [a-zA-Z0-9\-._~!\$&'*+,;=:\@/?\\{}] | $pct_enc | \( (?-1) \) | \[ (?-1) \] | $general_unicode )* ) }xms; my $fragment = $query; qr{ $protocol :// (?: $userinfo \@ )? $host (?: : [0-9]+ )? $path? (?: \? $query )? (?: \# $fragment )? (? substr $text, $pos_prev, $-[0] - $pos_prev] if $pos_prev < $-[0]; push @parts, [URL => $1]; $pos_prev = $+[0]; } push @parts, [TEXT => substr $text, $pos_prev] if $pos_prev < length $text; @parts } 'ok' __END__ =encoding utf8 =for github-markdown [![Coverage Status](https://coveralls.io/repos/github/mauke/URL-Search/badge.svg?branch=main)](https://coveralls.io/github/mauke/URL-Search?branch=main) =head1 NAME URL::Search - search for URLs in plain text =head1 SYNOPSIS =for highlighter language=perl use URL::Search qw( $URL_SEARCH_RE extract_urls partition_urls ); if ($text =~ /($URL_SEARCH_RE)/) { print "the first URL in text was: $1\n"; } my @all_urls = extract_urls $text; =head1 DESCRIPTION This module searches plain text for URLs and extracts them. It exports (on request) the following entities: =head2 C<$URL_SEARCH_RE> This variable is the core of this module. It contains a regex that matches a URL. NOTE: This regex uses capturing groups internally, so if you embed it in a bigger pattern, the numbering of any following capture groups will be off. If this is an issue, use named capture groups of the form C<< (?...) >> instead. See L. It only matches URLs with an explicit schema (one of C or C). The pattern is deliberately not anchored at the beginning, i.e. it will match C in C<"click herehttp://foo">. If you don't want that, use C. It tries to exclude artifacts of the surrounding text: =for highlighter Is mayonnaise an instrument? (https://en.wikipedia.org/wiki/Instrument, https://en.wikipedia.org/wiki/Mayonnaise_(instrument)) In this example it will match C and C, without the comma after "Instrument" and the final closing parenthesis. It understands all common URL elements: username, hostname, port, path, query string, fragment identifier. The hostname can be an IP address (IPv4 and IPv6 are both supported). Unicode is supported (e.g. C is matched correctly). =head2 C This function takes a string and returns a list of all contained URLs. It uses L|/C<$URL_SEARCH_RE>> to find matches. Example: =for highlighter language=perl my $text = 'Visit us at http://html5zombo.com. Also, https://archive.org'; my @urls = extract_urls $text; # @urls = ('http://html5zombo.com', 'https://archive.org') =head2 C This function takes a string and splits it up into text and URL segments. It returns a list of array references, each of which has two elements: The type (the string C<'TEXT'> or C<'URL'>) and the portion of the input string that was classified as text or URL, respectively. Example: my $text = 'Visit us at http://html5zombo.com. Also, https://archive.org'; my @parts = partition_urls $text; # @parts = ( # [ 'TEXT', 'Visit us at ' ], # [ 'URL', 'http://html5zombo.com' ], # [ 'TEXT', '. Also, ' ], # [ 'URL', 'https://archive.org' ], # ) You can reassemble the original string by concatenating the second elements of the returned arrayrefs, i.e. C<< join('', map { $_->[1] } partition_urls($text)) eq $text >>. This function can be useful if you want to render plain text as HTML but hyperlink all embedded URLs: use URL::Search qw(partition_urls); use HTML::Entities qw(encode_entities); my $text = ...; my $html = ''; for my $part (partition_urls $text) { my ($type, $str) = @$part; $str = encode_entities $str; if ($type eq 'URL') { $html .= "$str"; } else { $html .= $str; } } # result is in $html =begin :README =head1 INSTALLATION To download and install this module, use your favorite CPAN client, e.g. L|cpan>: =for highlighter language=sh cpan URL::Search Or L|cpanm>: cpanm URL::Search To do it manually, run the following commands (after downloading and unpacking the tarball): perl Makefile.PL make make test make install =end :README =head1 SUPPORT AND DOCUMENTATION After installing, you can find documentation for this module with the L|perldoc> command. =for highlighter language=sh perldoc URL::Search You can also look for information at L. To see a list of open bugs, visit L. To report a new bug, send an email to C. =head1 AUTHOR Lukas Mai, C<< >> =head1 COPYRIGHT & LICENSE Copyright 2016, 2017, 2023 Lukas Mai. This program is free software; you can redistribute it and/or modify it under the terms of either: the GNU General Public License as published by the Free Software Foundation; or the Artistic License. See L for more information.