\n" if $multipleURLs;
print "
$documentChecked{$lang}
\n
\n";
if (length $outputURL) {
print " - URL: $outputURL
\n";
} elsif (defined $file && length $file) {
print " - File: " . escapeHTML($file) . "
\n";
}
print " - Last modified: " . escapeHTML($lastModified) . "
\n" if $lastModified;
# Print character encoding information
print "$encodingMsg";
# Print level of HTML checked
print " - $levelOfHTML{$lang} " . $htmlLink{"${lang}-$htmlLevel"} . "
\n";
if ($noValid) {
print " - $wellformednessCheck{$lang}
\n";
}
print "
\n";
}
# Report errors
if ($#errors > -1 || $#externalErrors > -1) {
# Pick the heading: "errors and warnings" vs. plain "errors".
if ($warnings) {
print "
$errorsAndWarningsHeading{$lang}
\n";
} else {
print "
$errorsHeading{$lang}
\n";
}
print "
\n";
# Errors reported against external documents.  Each record is
# colon-separated: fields 1-2 rejoin into the URL (scheme:rest),
# field 3 is the line number, field 4 the zero-based column, and
# fields 5/6 the message text.
foreach (@externalErrors) {
my @error = split(/:/, $_, 7);
# Determine URL containing the error
my $errorURL;
if ($error[1] =~ /(.+)/o) {
$errorURL = "$1:$error[2]";
}
my $lineNumber = $error[3];
# Report the column one-based.
my $character = $error[4] + 1;
my $escapedURL = escapeHTML($errorURL);
print "- $escapedURL, " .
lc($lineNumberText{$lang}) .
"$lineNumber, $characterNumberText{$lang}$character: ";
# Prefer the later message field when present.
if ($error[6]) {
print escapeHTML($error[6]);
} else {
print escapeHTML($error[5]);
}
print "
\n";
}
# Errors for the primary document.  Record fields: [2] line, [3] zero-based
# column, [4] severity code (E/X/W), [5] message.  When field [5] is empty,
# [4] is used as the message instead -- TODO(review): confirm the record
# format; using [4] both as severity and fallback message looks fragile.
foreach (@errors) {
my @error = split(/:/, $_, 6);
# I don't think this should happen, but I'm not sure
next if $#error < 4;
# Determine line number and character of error
my $lineNumber = $error[2] - $lineAdjust;
# Skip errors attributed to lines that were prepended before validation.
next unless $lineNumber > 0;
my $character = $error[3] + 1;
if ($input) {
# NOTE(review): $urlNumber is built but never used in this branch --
# presumably it anchored a link before markup was stripped; verify
# against the original file.
my $urlNumber = "";
if ($multipleURLs) {
$urlNumber = "${urlsChecked}-";
}
print "- $lineNumberText{$lang}$lineNumber";
} else {
print "
- $lineNumberText{$lang}$lineNumber";
}
print ", $characterNumberText{$lang}$character:\n";
# Pattern matching a single character in the document's encoding;
# plain '.' for single-byte charsets.
my $oneChar = ($multibyte{$charset} || '.');
# Extract relevant section of HTML source.
# Perl segfaults on the extraction regexp with UTF-8 encoding and
# very long lines (as seen with http://www.msn.com). Just skip
# the extract if conditions are ripe for such a segfault.
if ($character < 10000 || $oneChar eq '.') {
my ($line, $preMatch, $maxMatch, $spacesToAdd, $extract, $insertedSpaces, $tabcount, $lineLength);
$line = superChomp($lines[$lineNumber-1]);
$lineLength = ulength($line, $oneChar);
# Show up to $extraChars characters on either side of the error.
$preMatch = max(0, $character - $extraChars);
$maxMatch = 2 * $extraChars;
if ($oneChar eq '.') {
# Single-byte charset: byte-oriented substr is safe.
$extract = substr($line, $preMatch, $maxMatch);
} else {
# Multi-byte charset: count characters with the charset-specific
# pattern rather than bytes.
($extract) = ($line =~ /
(?:$oneChar)
{$preMatch}
((?:$oneChar)
{1,$maxMatch})/x);
}
$spacesToAdd = $error[3];
# Expand tabs in the first part of the string to ensure that
# our character pointer lines up correctly
($insertedSpaces, $tabcount) = (0, 0);
if ($extract =~ /\t/o) {
my ($firstPart, $secondPart) =
($extract =~ /^(
(?:$oneChar)
{0,$spacesToAdd})
(.*)$/sx);
($insertedSpaces, $tabcount, $firstPart) = tabExpand($firstPart);
$extract = "$firstPart$secondPart";
# Keep the caret column in sync with the tab expansion above.
$spacesToAdd = $spacesToAdd - $tabcount + $insertedSpaces;
}
if (length($extract) > 0) {
$extract = "
" . escapeHTML($extract) . "
";
# Check if the line was truncated for the extract
if ($preMatch > 0) {
$extract = "... $extract";
$spacesToAdd = $extraChars + 3 - $tabcount + $insertedSpaces;
}
if ($preMatch + $maxMatch < $lineLength) {
$extract = "$extract ...";
}
# Link element names in extract
$extract = linkElements($extract);
print "$extract\n";
# Emit the caret pointing at the offending column.
print ' ' x $spacesToAdd;
print "^
\n";
}
}
# Prepare error message, adding emphasis and links where appropriate
my $errorMsg;
if ($error[5]) {
$errorMsg = superChomp(escapeHTML($error[5]));
} else {
$errorMsg = superChomp(escapeHTML($error[4]));
}
# Rewrite {{...}} markers embedded in the message.
# NOTE(review): both branches of the if/else below are byte-identical --
# the linked/unlinked markup was presumably lost when HTML tags were
# stripped from these literals; confirm against the original file.
while ($errorMsg =~ m#\{\{(?:")?(.+?)(?:")?\}\}#gos) {
my $linkText = $1;
my $lcLinkText = lc($linkText);
if ($links{$lcLinkText}) {
$errorMsg =~ s#\{\{(")?$linkText(")?\}\}# $1$linkText$2#;
} else {
$errorMsg =~ s#\{\{(")?$linkText(")?\}\}# $1$linkText$2#;
}
}
# Workaround for the incorrect display of the following error:
# value of attribute "NOWRAP" cannot be ""; must be one of
# "NOWRAP"
$errorMsg =~ s#""#""#go;
$errorMsg =~ s#"(.+?)"#$1#g;
print "";
# Prefix with a localized "Error:" / "Warning:" label by severity code.
if ($error[4] eq 'E' || $error[4] eq 'X') { # Error message
print "$preError{$lang}";
} elsif ($error[4] eq 'W') { # warning
print "$preWarning{$lang}";
}
print "$errorMsg
\n";
}
print "
\n";
# Note when reporting was cut off at the configured maximum.
if ($maxErrors) {
print "
The maximum number of errors was reached. Further errors in the document have not been reported.
\n";
}
} else { # no errors
print "
$noErrors{$lang}
\n" unless $hidevalid;
}
# Show input if desired
if ($input && (!$hidevalid || $#errors > -1 || $#externalErrors > -1)) {
my $cite = "";
$cite = " cite=\"$outputURL\"" if $outputURL;
# NOTE(review): $cite is not referenced below in this view -- it likely
# belonged to a blockquote/q attribute stripped from the literals.
print "
$inputHeading{$lang}
\n
";
my $line;
my $lineCount = 1;
# Determine maximum number of digits for a line number
my $maxNumLength = length($#lines + 1);
foreach $line (@lines) {
$line = superChomp($line);
# Add spaces to right-align line numbers
my $addedSpaces = $maxNumLength - length($lineCount);
print ' ' x $addedSpaces;
# NOTE(review): $urlNumber is built but unused below -- presumably a
# per-URL anchor prefix before markup was stripped; verify.
my $urlNumber = "";
if ($multipleURLs) {
$urlNumber = "${urlsChecked}-";
}
print "$lineCount " . linkElements(escapeHTML($line)) . "
\n";
$lineCount++;
}
print "
\n";
}
if ($multipleURLs) {
# Count this document and close its output section.
$urlsChecked++;
print "
\n" if (!$hidevalid || $#errors > -1 || $#externalErrors > -1);
# Update list of links to spider
my $link;
foreach $link (@documentLinks) {
# Only queue http(s)/ftp links for spidering.
# NOTE(review): alternation precedence makes this match /^https?/ OR
# /ftp$/ rather than the (likely intended) anchored ^(?:https?|ftp)$.
if ($link->scheme =~ /^(?:https?)|(?:ftp)$/i) {
push(@urls, $link);
}
}
}
}
print "Checked $urlsChecked page" . ($urlsChecked != 1 ? "s" : "") . ".
\n" if ($multipleURLs);
# Output footer
&printFile("$endingHTML.$lang");
# Fetch a document and return it
# Takes the URL as the first argument
# The URL is assumed to have been checked for basic validity (e.g., that it
# begins with "http://" or "ftp://").
# Calls &error if the document cannot be retrieved
# Returns ($url, $charset, $lastModified, 1, $isXML) on success, or 0 when
# spidering and the document should be skipped.
# NOTE(review): the sub is truncated in this view -- the failure path after
# the final &error(... continues beyond the visible source.
sub getDocument {
my $url = shift;
# Lazily create the shared user agent on first use.
if (!defined $userAgent) {
if ($spider) {
# Spider mode: robots-aware UA with no inter-request delay.
$userAgent = SpiderUA->new($spiderUA, $spiderFrom);
$userAgent->delay(0);
} else {
$userAgent = new LWP::UserAgent;
$userAgent->agent("$normalUA");
}
# Restrict fetches to the schemes the validator supports.
$userAgent->protocols_allowed(['http','https','ftp']);
}
# Prepare request
my $request = new HTTP::Request 'GET' => $url;
if (defined $acceptHeader) {
$request->header(Accept => $acceptHeader);
}
# Receive response
my $response;
if ($spider) {
# Stream the body through the callback so links are collected and
# content is parsed as it arrives.
$response = $userAgent->request($request, \&requestCallback);
} else {
$response = $userAgent->request($request);
}
# Determine URL of document. This may be different from the original
# request URL if we were redirected.
if (defined $response->request) {
$url = $response->request->url;
}
# Check return status
if ($response->is_success) {
# Bail out if we're spidering and we found a non-HTML/XML/SGML document
return 0 if $spider && (!checkContentType($response->content_type)
|| ($response->content_encoding &&
$response->content_encoding ne 'identity'));
# Determine character encoding of document
my $contentType = $response->header('Content-Type');
my $charset = "";
if ($contentType && $contentType =~ /[\s;]charset\s*=\s*"?([^,"\s]+)/io) {
$charset = $1;
}
# Grab Last-Modified header
my $lastModified = $response->header('Last-Modified');
# Expand found links' URLs to absolute ones if spidering
if ($spider) {
my $base = $response->base;
@documentLinks = map { $_ = URI->new_abs($_, $base); } @documentLinks;
} else {
# If we're not spidering, set the document to the content held by
# the response object. If we are spidering, the content is stored
# as it's received and parsed.
$document = $response->content;
}
return ($url, $charset, $lastModified, 1,
isXMLType($response->content_type));
} else {
# Spider-rule rejections are skipped silently; any other failure is a
# reportable fetch error.
return 0 if ($spider && $response->message eq 'Forbidden by spider rules');
&printHeader('ISO-8859-1', $lang) unless $multipleURLs;
&printFile("$beginningHTML.$lang") unless $multipleURLs;
my $outputURL = escapeHTML($url);
&error("Error retrieving