HTML-WikiConverter-0.68/0000755000076500000240000000000011161206511014301 5ustar diberristaffHTML-WikiConverter-0.68/bin/0000755000076500000240000000000011161206511015051 5ustar diberristaffHTML-WikiConverter-0.68/bin/html2wiki0000755000076500000240000002030410455012131016706 0ustar diberristaff
#!/usr/bin/perl
package main;

use warnings;
use strict;

use HTML::WikiConverter;

# Turn the command line into converter attributes, slurp the HTML from the
# files left in @ARGV (or from STDIN when there are none), and print the
# resulting wiki markup followed by a newline.
my %o    = H::WC::GetOpts->get_opts();
my $wc   = HTML::WikiConverter->new( %o );
my $html = do { local $/; <> };
print $wc->html2wiki($html), "\n";

#
# Attribute/option handling
#

package H::WC::GetOpts;

use Params::Validate ':types';
use Getopt::Long;
use Pod::Usage;

# get_opts()
#
# Parses @ARGV with Getopt::Long and returns a hash of attribute names
# (underscore-separated) to the values given on the command line. Options
# that were not specified are left out. The dialect falls back to the
# WCDIALECT environment variable; usage is printed (exit status 2) when no
# dialect is available or when option parsing fails.
sub get_opts {
  my %attrs = known_attributes();

  # Pre-seed every known option with undef so that unspecified options can
  # be recognized (and dropped) after parsing.
  my %o = map { $_ => undef } keys %attrs;

  my @optspec = (
    \%o,
    'list' => sub {
      warn "Installed dialects:\n";
      print map "$_\n", HTML::WikiConverter->available_dialects;
      exit(1);
    },
    'help' => sub { pod2usage( -exit => 1, -verbose => 0 ) },
    'options' => sub {
      warn "Accepted options:\n";
      print map "--$_\n", sort keys %attrs;
      exit(1);
    },
    attrs2optspecs(\%attrs)
  );

  eval { GetOptions( @optspec ) or pod2usage(2) };
  die "problem parsing command-line options: $@" if $@;

  $o{dialect} ||= $ENV{WCDIALECT};
  pod2usage(2) unless $o{dialect};

  remove_ignored_opts(\%o);
  return %o;
}

# known_attributes()
#
# Returns a hash mapping option names (dash-separated) to attribute specs.
# It starts from HTML::WikiConverter's default attribute specs and merges in
# the attributes() of every installed dialect module that can be loaded;
# the names listed by IGNORED_ATTRS() are removed.
sub known_attributes {
  my %attributes = (
    %{ HTML::WikiConverter->__default_attribute_specs },
  );

  my @modules = map { "HTML::WikiConverter::$_" } HTML::WikiConverter->available_dialects;
  foreach my $module ( @modules ) {
    # Dialects that fail to load are silently skipped.
    next unless eval "use $module; 1";

    my %attrs = %{ $module->attributes };
    foreach my $attr ( keys %attrs ) {
      $attributes{$attr} = $attrs{$attr};
    }
  }

  delete $attributes{$_} for IGNORED_ATTRS();

  # Normalize attr name with dashes
  foreach my $attr ( keys %attributes ) {
    ( my $new_attr = $attr ) =~ tr/_/-/;
    next if $new_attr eq $attr;
    $attributes{$new_attr} = delete $attributes{$attr};
  }

  return %attributes;
}

# attrs2optspecs( \%attrs )
#
# Converts attribute specs into a sorted list of Getopt::Long option
# specifications. Attributes without a 'type' are treated as BOOLEAN,
# CODEREF attributes are skipped, and names present in FORCED_OPTSPECS()
# use the spec given there.
sub attrs2optspecs {
  my $attrs = shift;

  my @specs = ( );
  my %forced_optspecs = FORCED_OPTSPECS();

  while( my( $attr, $spec ) = each %$attrs ) {
    my $type = $spec->{type} ? $spec->{type} : BOOLEAN;
    next if $type == CODEREF;
    next unless $attr;

    if( my $fspec = $forced_optspecs{$attr} ) {
      push @specs, $fspec;
      next;
    }

    # '=' makes the option's value mandatory, ':' makes it optional.
    my $reqopt_sym = exists $spec->{optional} && !$spec->{optional} ? '=' : ':';

    my $type_sym = '';
    $type_sym = 's'  if $type == SCALAR;
    $type_sym = 's@' if $type == ARRAYREF or $type == ( SCALAR | ARRAYREF );

    # Booleans that default to 1 get a negatable form (--no-<option>).
    my $neg_sym = '';
    $neg_sym = '!' if $type == BOOLEAN and $spec->{default} and $spec->{default} eq '1';

    # No required/optional symbol without a type to follow it
    $reqopt_sym = '' unless $type_sym;

    # Named $optspec so it does not shadow the attribute spec in $spec.
    my $optspec = join '', $attr, $reqopt_sym, $type_sym, $neg_sym;
    push @specs, $optspec;
  }

  return sort @specs;
}

# remove_ignored_opts( \%o )
#
# Edits the option hash in place: drops ignored and undefined entries, and
# renames dash-separated keys to their underscore-separated attribute names.
sub remove_ignored_opts {
  my $o = shift;
  my %ignored_attrs = map { $_ => 1 } IGNORED_ATTRS();

  foreach my $key ( keys %$o ) {
    if( $ignored_attrs{$key} or !defined $o->{$key} ) {
      delete $o->{$key};
      next;
    }

    ( my $new_key = $key ) =~ tr/-/_/;
    next if $new_key eq $key;
    $o->{$new_key} = delete $o->{$key};
  }
}

# Names that must never be passed to HTML::WikiConverter as attributes.
sub IGNORED_ATTRS { qw/ list help options slurp / }

# For forward compatibility until 'type' is specified for all dialect attributes
sub FORCED_OPTSPECS { (
  'base-uri'             => 'base-uri:s',
  'header-style'         => 'header-style:s',
  'image-style'          => 'image-style:s',
  'link-style'           => 'link-style:s',
  'ordered-list-style'   => 'ordered-list-style:s',
  'strip-tags'           => 'strip-tags:s@',
  'unordered-list-style' => 'unordered-list-style:s',
  'wiki-uri'             => 'wiki-uri:s@',
) }

__END__

=head1 NAME

html2wiki - convert HTML into wiki markup

=head1 SYNOPSIS

html2wiki [options] [file]

Commonly used options:

    --dialect=dialect    Dialect name, e.g. "MediaWiki" (required unless
                         the WCDIALECT environment variable is used)
    --encoding=encoding  Source encoding (default is 'utf-8')
    --base-uri=uri       Base URI for relative links
    --wiki-uri=uri       URI fragment for wiki links
    --wrap-in-html       Wrap input in <html> and </html> (enabled by
                         default). Use --no-wrap-in-html to disable.
--escape-entities Escape HTML entities within text elements (enabled by default). Use --no-escape-entities to disable. --list List installed dialects and exit --options List all recognized options (except for negations such as --no-wrap-in-html) --help Show this message and exit Additional options, including those corresponding to dialect attributes, are also supported. Consult the html2wiki man page for details. Example: html2wiki --dialect MediaWiki --encoding iso-8859-1 \ --base-uri http://en.wikipedia.org/wiki/ \ --wiki-uri http://en.wikipedia.org/wiki/ \ input.html > output.wiki =head1 DESCRIPTION C<html2wiki> is a command-line interface to L<HTML::WikiConverter>, which it uses to convert HTML to wiki markup. =head1 DIALECTS If the dialect you provide in C<--dialect> is not installed on your system (e.g. if you specify C<MediaWiki> but have not installed its dialect module, L<HTML::WikiConverter::MediaWiki>) a fatal error will be issued. Use C<html2wiki --list> to list all available dialects on your system. Additional dialects may be downloaded from the CPAN. =head1 OPTIONS =head2 Correspondence of options and attributes Each of the options accepted by C<html2wiki> corresponds to an HTML::WikiConverter attribute. Commonly used options described in C<html2wiki --help> therefore correspond to attributes discussed in L<HTML::WikiConverter/ATTRIBUTES>. That section also contains other attributes that may be used as C<html2wiki> command-line options. =head2 Mapping an attribute name to an option name While related, option names are not identical to their corresponding attribute names. The only difference is that attribute names use underscores to separate words while option names use hyphens. For example, the C<base_uri> attribute corresponds to the C<--base-uri> command-line option. =head2 Additional options defined in dialect modules Individual dialects may define their own attributes, and therefore make available their own command-line options to C<html2wiki>, in addition to the ones defined by C<HTML::WikiConverter>. The same rules described above apply for converting between these attribute names and their corresponding command-line option names. 
For example, Markdown supports an C<unordered_list_style> attribute that takes a string value. To use this attribute on the command line, one would use the C<--unordered-list-style> option. Consult individual dialect man pages for a list of supported attributes. =head2 Options that are enabled by default Attributes that take boolean values may be enabled by default. The C<wrap_in_html> attribute is one such example. Because of this, C<html2wiki> will effectively behave by default as if C<--wrap-in-html> had been specified in every invocation. If this is not desired, the option name may be prefixed with C<no-> to disable the option, as in C<--no-wrap-in-html>. =head2 Options that take multiple values Some attributes (eg, C<strip_tags> and C<wiki_uri>) accept an array of values. To accommodate this in C<html2wiki>, such options can be specified more than once on the command line. For example, to specify that only comment and script elements should be stripped from HTML: % html2wiki --strip-tags ~comment --strip-tags script ... =head1 INPUT/OUTPUT Input is taken from STDIN, so you may pipe the output from another program into C<html2wiki>. For example: curl http://example.com/input.html | html2wiki --dialect MediaWiki You may also specify a file to read HTML from: html2wiki --dialect MediaWiki input.html Output is sent to STDOUT, though you may redirect it on the command line: html2wiki --dialect MediaWiki input.html > output.wiki Or you may pipe it into another program: html2wiki --dialect MediaWiki input.html | less =head1 AUTHOR David J. Iberri, C<< >> =head1 COPYRIGHT & LICENSE Copyright 2006 David J. Iberri, all rights reserved. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. 
=head1 SEE ALSO L =cut HTML-WikiConverter-0.68/cgi/0000755000076500000240000000000011161206511015043 5ustar diberristaffHTML-WikiConverter-0.68/cgi/index.cgi0000755000076500000240000000055011154377342016656 0ustar diberristaff
#!/usr/bin/perl
use warnings;
use strict;

use HTML::WikiConverter::WebApp;

#
# Settings for the html-to-wiki web application. Every entry in the
# list below must end with a comma.
#

my %config = (
  # Full path to the templates/ directory (eg, where main.html is)
  template_path => '__PATH_TO_TEMPLATES__',
);

# Create the web application with the settings above and run it.
my $webapp = HTML::WikiConverter::WebApp->new( PARAMS => \%config );
$webapp->run;
HTML-WikiConverter-0.68/cgi/templates/0000755000076500000240000000000011161206511017041 5ustar diberristaffHTML-WikiConverter-0.68/cgi/templates/main.html0000644000076500000240000000721311157615420020666 0ustar diberristaff html2wiki - Convert HTML text to wiki markup

HTML-to-wiki converter

markup


Try this out in a sandbox for kicks.

Parsed HTML

HTML source

checked="checked" />
 
checked="checked" />
 
checked="checked" />

Options

  checked="checked" />
checked="checked"/>

HTML-WikiConverter-0.68/cgi/templates/sample_html.html0000644000076500000240000000737111126215443022251 0ustar diberristaff Complete example

HTML::WikiConverter

HTML::WikiConverter is a Perl module for converting HTML to wiki markup for a variety of different wiki engines. Currently, it supports many dialects:

  • DokuWiki
  • Kwiki
  • MediaWiki
  • MoinMoin
  • Oddmuse
  • PhpWiki
  • PmWiki
  • SnipSnap
  • TikiWiki
  • UseMod
  • WakkaWiki
  • (and others)

I'd really like to add support for TWiki, but the way it handles bold italics (and possibly other nested elements) is very disappointing.

Installation

It's very easy to use HTML::WikiConverter. Grab a copy from your favorite CPAN mirror and then run something like this:

  #!/usr/bin/perl -w
  use HTML::WikiConverter;
  my $wc = new HTML::WikiConverter( dialect => 'MediaWiki' );
  print $wc->html2wiki($html);

Alternatively, you might want to try HTML::WikiConverter's command line interface; it's a program called html2wiki and it's stored in bin/html2wiki.

Dialect features

Unfortunately, I haven't managed to implement every feature of each supported dialect. My initial goal was simply to create a converter for MediaWiki (I am a Wikipediholic, after all :-), but then PhpWiki's Reini Urban suggested that I be more ambitious and provide the dialect interface. And so far I've been really happy with the progress.

Bugs

Of course there are always bugs. Luckily, CPAN comes to the rescue again with its bug tracking service for module authors.

But where would we be without bugs? My Pacman frog sure wouldn't be happy without 'em! Do you know what a Pacman frog is? Or how about an Otago skink? Aplysia californica? Why, these are some of my favorite animals!

My favorite animals
Animal Region Physical traits Food
Pacman frog Gran Chaco (Argentina) Half mouth, half stomach (quite literally!) Crickets, fish, etc.
Otago skink Otago (New Zealand) Black, yellow, and green camouflage Insects, fruits, small lizards
Aplysia californica California Deep red-colored sea hare Red and brown seaweed

Common features

Images

Many dialects allow embedded images, either from local or remote stores.

Google logo

Lists

Wikis also support lists, even extensively nested ones. Some rely on leading characters to determine nest levels, while others rely on spaces or (egad!) tabs.

  • 1
    • 1a
    • 1b
  • 2
    • 2a
      1. fee
      2. fie
      3. foe
        • fum?

Some wikis support definition lists, though they're often used inappropriately (as in MediaWiki, which uses them for indentation).

Perl
Pathologically eclectic rubbish lister
POE
Perl on ecstasy
HTML-WikiConverter-0.68/cgi/templates/stylesheet.css0000644000076500000240000000105511126477064021763 0ustar diberristaffbody { font-family: verdana, arial; font-size: 10pt } h1 { font-size: 16pt } h2 { font-size: 14pt } h3 { font-size: 12pt } #wrapper { margin: 5px; padding: 10px; border: 1px solid #ccccff; background: #eeeeff; } #notes { float: right; width: 100px; } textarea { width: 100%; height: 30%; } table#html_source td:last-child { width: 100%; } div#error { margin: 10px; padding-left: 10px; background: #ffcccc; border: 1px solid #990000; } #logos img { border: 0; margin: 5px; } input[type='submit'] { text-align: left; } HTML-WikiConverter-0.68/Changes0000644000076500000240000003345711161206422015611 0ustar diberristaff# Change log for HTML::WikiConverter version: 0.68 date: 2009-03-21 changes: - (bug #20594) require CSS.pm version 1.07 to fix some 'make test' errors (eg, DokuWiki) - improve some documentation version: 0.67 date: 2009-03-16 changes: - add 'p_strict' attribute for enabling/disabling HTML::TreeBuilder's p_strict option (enabled by default) -- this was done for the Markdown dialect, specifically for bug #43997 - mention XML::Writer requirement in README if cgi app is installed - webapp-install script creates webapp directory if necessary - miscellaneous minor code cleanup version: 0.66 date: 2009-03-07 changes: - fix test suite: change html2wiki/test.html to html2wiki-old/test.html - rudimentary webapp-install script version: 0.65 date: 2009-03-07 changes: - web app fixes: comment-out the donation link in templates/main.html, improved docs in cgi/index.cgi version: 0.64 date: 2009-03-06 changes: - add web application, HTML::WikiConverter::WebApp - now requires CGI::Application (for the aforementioned web app) - (bug #40845, debian #506584) allow relative wiki_uri (eg, "/wiki/"); an absolute wiki_uri is constructed from a relative wiki_uri and a base_uri - add perl license to Makefile.PL version: 0.63 date: 2008-11-11 changes: - improved 
support for nested blocked elements (needed, e.g., for MediaWiki support of 'p' elements within table cells, bug #37911) version: 0.62 date: 2008-05-16 changes: - blocked elements are not blocked if they are contained within a blocked element (with much thanks to Dominick Bellizzi for the patch) - rules_for_tag() now dereferences the 'alias' subrule version: 0.61 date: 2006-07-21 changes: - add HTML::Element and CSS prereqs for Normalizer.pm so that 'make test' succeeds version: 0.60 date: 2006-07-20 changes: - add CSS-to-HTML normalization so, for example, 'text' will be interpreted as 'text' before conversion to wiki markup - CSS-to-HTML normalization, which is enabled by default, can be disabled by setting the 'normalize' to a false value - add 'passthrough_naked_tags' attribute, which accepts a list of tags to be replaced with their content if the tags contain no attributes - add on-demand rule-loading so that rules that depend on attribute values will be updated when attributes are updated - add "UNKNOWN" rule as a catch-all for unknown tags - HTML can now be fetched from a URI by passing a 'uri' option to the html2wiki() method - the user agent used to fetch content from a URI can be specified in the 'user_agent' attribute - add 'passthrough' subrule for dialect module authors - rename 'remove_empty' to 'strip_empty_tags' for consistency with other attributes - improve handling of 'strip_empty_tags' attribute so that elements containing only whitespace are considered empty - documentation tweaks, including better synopsis [todo] version: 0.55 date: 2006-06-08 changes: - (bug #19429) add "escape_entities" attribute - html2wiki utility accepts command-line options for all dialect attributes - all recognized options to html2wiki now displayed with --options switch on the command line - improved error messages using carp/croak version: 0.54 date: 2006-06-07 changes: - (bug #19046) allow lone '0' in text (previously such strings were assumed empty) - (bug #19046) 
element attributes must be contained on a single line version: 0.53 date: 2006-06-03 changes: - revert "dialects now inherit via 'use HTML::WikiConverter -dialect;'" change - lots of documentation tweaks - allow attributes in html2wiki() - add 'remove_empty' attribute (requires HTML::Tagset) - add 'slurp' attribute (previously it was just an html2wiki() arg) - add eof() call after parse() version: 0.52 date: 2006-03-03 changes: - document the 'preprocess' attribute - remove 'wiki_page_extractor' attribute - allow 'wiki_uri' to contain coderefs that were previously used for the now-defunct 'wiki_page_extractor' attribute - use File::Slurp (if available) for file slurping - 'wrap_in_html' now applies to html read from files too - fix bug in how 'wrap_in_html' is applied to files (previously it was clobbering html read from files) - add attribute() function/class method for greater flexibility when defining attribs (hence new Params::Validate and Class::Data::Inheritable prereqs) - coalesce 'strip_*' into 'strip_tags' attribute, which accepts an arrayref - fix bug in __param() that caused Boolean attributes to return an empty string when they were set to a false value - dialects now inherit via 'use HTML::WikiConverter -dialect;' - dialects no longer have to be within the H::WC namespace - rename __param() to _attr() and document its intended use by dialect modules version: 0.51 date: 2006-01-29 changes: - add available_dialects() method, with thanks to Tatsuhiko Miyagawa for the code and suggestions - add 'preprocess' attribute, which accepts a callback to allow client to alter HTML tree prior to html2wiki conversion - improve handling of nested block elements to simplify dialect rules - allow 'wiki_uri' to contain arrayref of wiki URIs, allowing things like 'http://en.wikipedia.org/wiki/' and 'http://en.wikipedia.org/w/index.php?action=edit&title=' to be specified simultaneously - allow wiki URIs in 'wiki_uri' attribute to be either a string (as before) or a 
regexp, the latter capturing the wiki page title in parentheses - add 'wiki_page_extractor' attribute, which accepts a callback that extracts a wiki page title given a URI object (see bug #17330) - unescape URIs using URI::Escape so that e.g. "%3A" appears as ":" in URIs version: 0.50 date: 2006-01-10 changes: - separate each dialect into is own CPAN package for maintainability version: 0.41 date: 2006-01-09 changes: - update documentation (no code changes) version: 0.40 date: 2006-01-09 changes: - (bug 13561) support multiple encodings via 'encoding' attribute - (bug 13106) DokuWiki: rename from DocuWiki - (bug 13167) PmWiki: handle - (bug 13171) PmWiki: inline style rules fixed - (bug 14274) MediaWiki: better handling - (bug 14527) MediaWiki: better handling of
 blocks with empty lines
  - trim whitespace at end of each wiki line
  - html2wiki utility now installed by 'make install'
  - 'wrap_in_html' attribute now enabled by default in H::WC

version: 0.30
date:    2005-06-03
changes:
  - add support for WakkaWiki, SnipSnap, Oddmuse, and TikiWiki (see
    http://c2.com/cgi/wiki?TopTenWikiEngines)
  - add DokuWiki support for links, blockquotes, images, and tables
  - add 'strip_scripts', 'strip_head', and 'strip_comments' options,
    on by default
  - allow 'start' and 'end' rules to be specified with 'preserve' to
    allow for wiki escaping
  - allow reading from HTML files using "html2wiki( file => ... )"
  - (bug 12944, 12303) preserve HTML entities in text nodes
  - (bug 13017) MediaWiki should allow <i> and <b> to be preserved
    rather than converting to '' and '''
  - backend API changes; dialect modules now subclass HTML::WikiConverter
  - dialects specify their attributes using the attributes() method
  - more complete tests using larger input HTML file
  - document individual dialects
  - lots of documentation fixes

version: 0.23
date:    2005-05-22
changes:
  - add DokuWiki dialect (with thanks to Thomas J. Messenger)
  - (bug 12441) allow dialects to post-process wiki markup
  - (bug 12440) add 'empty' rule for elements with no content
  - (bug 12439) unknown HTML elements no longer ignored
  - (bug 12396) added command line tool bin/html2wiki
  - (bug 12395) MoinMoin: added postprocessing step in MoinMoin
    dialect to replace "URL[[BR]]" with "URL [[BR]]"
  - (bug 12433) MediaWiki: added complete set of whitelisted tags and
    attributes from Sanitizer.php

version: 0.22
date:    2005-04-25
changes:
  - (bug 12393) added 'wrap_in_html' option to wrap HTML input in
    <html> and </html> prior to wiki conversion
  - (bug 12303) MediaWiki: bracketed content is nowiki-escaped if it
    resembles an external link reference
  - (bug 12438) PmWiki: add  whitespace trimming

version: 0.21
date:    2005-03-17
changes:
  - misc. documentation fixes
  - use '' as default return value from base_uri() method
  - add 'wiki_uri' parameter to specify how wiki links are formed
  - add get_attr_str() and get_wiki_page() utility methods
  - rename elem_contents() to get_elem_contents() for consistency
  - added support for PmWiki
  - fix postprocess whitespace trimming bug; only leading newlines and
    trailing whitespace is trimmed
  - add URI 1.35 requirement (though some earlier versions will
    probably work)
  - fix whitespace trimming in test suite
  - added new tests
  - Kwiki-specific changes:
      - add  as alias of 
      - add  preprocessing to convert it into a 

- update wiki link handling code to use get_wiki_page() - removed markup around CamelCase links - MediaWiki-specific changes: - use new get_wiki_page() utility method - no longer passes a full URI to basename(); uses URI->path() - added support for attributes in , , and
- MoinMoin-specific changes: - add
preprocessing as in Kwiki - make rules less redundant (use 'alias' rule) - PhpWiki-specific changes: - add preprocessing as in Kwiki - improve support for
- UseMod-specific changes: - add
preprocessing as in Kwiki version: 0.20 date: 2005-03-12 changes: - complete rewrite, significant API changes - added support for Kwiki, MediaWiki, MoinMoin, PhpWiki, and UseMod version: 0.17 date: 2004-07-07 changes: - update test suite - remove warnings reported by cpan testers - lots of documentation additions - MediaWiki changes: - 'wikify_span' now removes elements intended only for URL expansion (as used by the MonoBook skin) - 'wikify_link' does not wikify anchor tags (i.e. A tags must have an HREF attribute) - bug fix: table heading markup like "! bgcolor=black !" is now properly generated as "! bgcolor=black |" - TH now accepts a colspan and rowspan attribute - "colspan=1" attribute is now stripped from table cells that only span a single column. Likewise for "rowspan" - added "taxo_format" option to help format taxoboxes - align attribute is now preserved in TH and TD - add "add_nowiki" parameter for adding NOWIKI tags around {{messages}} - table caption handling using "|+" wiki table markup - better nested table handling; a newline is now added before the "{|" for nested tables - improved handling of image thumbnails - now uses warnings, strict version: 0.16 date: 2004-05-28 changes: - Added 'elem_style_attr_props' method for parsing the STYLE attribute - Tags that are meant to be stripped (e.g. HTML, META, HEAD) no longer need an empty replacement handler (they can simply be excluded from the list of tag handlers) - Added handling of non-breaking elements (e.g. 
in some dialects, such as MediaWiki, P tags should not have any embedded newlines) - Fixed whitespace handling in H::W::Dialect - Documentation additions - Added benchmarking option - MediaWiki: added colspan and rowspan to allowed TD attrs - MW: Now supports 'pretty_tables' option for making nice borders/shading - MW: Removed arbitrary 20-char limit in conversion of {{...}} magic - MW: "[1]"-style links are now handled properly - MW: Hex codes/HTML entities in URLs now translated to appropriate char (requires URI::Escape) - MW: Major list handling cleanup and bug fixes - MW: Added new realworld test (my User: page) version: 0.15 date: 2004-05-20 changes: - Split module into several separate modules - HTML::WikiConverter is still the main interface - Added support for wiki dialects via HTML::WIkiConverter::Dialect interface - Added HTML::WikiConverter::Dialect - Added HTML::WikiConverter::Dialect::MediaWiki - Added HTML::WikiConverter::Dialect::PhpWiki - Added HTML::WikiConverter::Dialect::Kwiki - Fixed spacing issues in tidy_whitespace - Added container, block, and line element handling - Now supports multiply-indented blocks version: 0.14 date: 2004-05-17 changes: - 'wikify_default' now called 'passthru' - NOWIKI blocks are no longer preserved - Bug fix: Add newline to HTML source before wikification -- avoids apparent bugs in HTML::TreeBuilder that prevent proper tag handling - Added trim method - Bug fix: Extra whitespace in PRE blocks no longer trimmed - Can now handle "[[language]]s" links (with trailing characters) version: 0.12 date: 2004-05-14 changes: - Bug fix: removed reference to non-existent 'has_parent' method within '_elem_has_ancestor' function - Bug fix: fixed potential bug in 'wikify_list_item' which used $node->parent->tag eq '...' 
instead of _elem_has_ancestor($node, '...') - Now supports definition lists via ; and : - Now supports indentation via : - Replaced code handler for P tag with flank handler - Replaced code handler for OL/UL tags with flank handlers - Renamed 'wikify_heading' method to 'wikify_h' for consistency with other wikify_* handlers - NOWIKI blocks are now preserved - Introduced beginnings of Unicode support with the use of HTML entities to HTML-escape high-bit characters - Added more tests to t/test.t version: 0.11 date: 2004-05-10 changes: - added wikify_default code handler for tags that should be preserved. - Added wikify_span - Now preserves FONT, SUP, and SUB tags - SPAN tags are converted into FONT if possible version: 0.10 date: 2004-05-07 changes: - Initial release HTML-WikiConverter-0.68/INSTALL0000644000076500000240000000635611154635541015357 0ustar diberristaffHTML::WikiConverter installation ================================ HTML::WikiConverter is an HTML to wiki converter capable of converting HTML source into a variety of wiki dialects. It's aimed at folks who are converting vanilla HTML websites into wikis. The HTML::WikiConverter module itself is an interface to its various dialect modules (e.g. HTML::WikiConverter::MediaWiki). 
To install the latest version of HTML::WikiConverter and all of its dialects, install Bundle::HTMLWikiConverter from CPAN: % perl -MCPAN -e 'install Bundle::HTMLWikiConverter' For just the interface: % perl -MCPAN -e 'install HTML::WikiConverter' Individual dialects can be installed separately, e.g.: % perl -MCPAN -e 'install HTML::WikiConverter::MediaWiki' % perl -MCPAN -e 'install HTML::WikiConverter::UseMod' % perl -MCPAN -e 'install HTML::WikiConverter::Oddmuse' Installing by hand ================== You may also obtain the source as a .tar.gz, in which case you can run the following after unpacking: % perl Makefile.PL % make && make test && make install The 'make install' step attempts to install the module and the html2wiki utility into something like /usr or /usr/local, which probably won't work unless you're an administrator. To work around this, you may install into your own local directory like so: % perl Makefile.PL PREFIX=~/myperl5 % make && make test && make install Installing the web application ============================== Included in this distribution is HTML::WikiConverter::WebApp, which is a CGI::Application-based web application that interfaces with HTML::WikiConverter and any installed dialects. (Note that you will need to have CGI::Application installed to use this feature.) The H::WC::WebApp module is installed automatically. The remaining files necessary for the web application are in the cgi/ directory. To install these files, first follow the basic installation instructions above. Next, copy the cgi/ directory to your cgi-bin/ directory, provide a suitable value for the 'template_path' variable in the provided index.cgi instance script, and assign appropriate permissions. 
These steps can all be accomplished via the included webapp-install script, which is executed like so: % ./webapp-install When complete, be sure to stop and restart your Apache server: % sudo apachectl stop % sudo apachectl start If the webapp-install script does not work or you choose not to use it, you may also install the web application by hand. (This is essentially what the webapp-install script does.) 1) Copy the cgi/ directory to your cgi-bin/ directory: % cp -r cgi /var/www/cgi-bin/html2wiki 2) Assign appropriate permissions: % chmod 755 /var/www/cgi-bin/html2wiki/index.cgi % chmod a+r /var/www/cgi-bin/html2wiki/templates/* 3) Edit the index.cgi instance script to provide a suitable template path, replacing __PATH_TO_TEMPLATES__ with the location of the cgi/templates/ folder: # Inside /var/www/cgi-bin/html2wiki/index.cgi: my %config = ( template_path => '/var/www/cgi-bin/html2wiki/templates', ); 4) Restart your Apache server as appropriate. 5) The web application should then be available via a standard URL: http://www.yourhost.com/cgi-bin/html2wiki/index.cgi Author ====== David J. 
Iberri HTML-WikiConverter-0.68/lib/0000755000076500000240000000000011161206511015047 5ustar diberristaffHTML-WikiConverter-0.68/lib/HTML/0000755000076500000240000000000011161206511015613 5ustar diberristaffHTML-WikiConverter-0.68/lib/HTML/WikiConverter/0000755000076500000240000000000011161206511020406 5ustar diberristaffHTML-WikiConverter-0.68/lib/HTML/WikiConverter/Dialects.pod0000644000076500000240000003442711014040625022653 0ustar diberristaff=head1 NAME HTML::WikiConverter::Dialects - How to add a dialect =head1 SYNOPSIS # In your dialect module: package HTML::WikiConverter::MySlimWiki; use base 'HTML::WikiConverter'; sub rules { { b => { start => '**', end => '**' }, i => { start => '//', end => '//' }, strong => { alias => 'b' }, em => { alias => 'i' }, hr => { replace => "\n----\n" } } } # In a nearby piece of code: package main; use Test::More tests => 5; my $wc = new HTML::WikiConverter( dialect => 'MySlimWiki' ); is( $wc->html2wiki( 'text' ), '**text**', b ); is( $wc->html2wiki( 'text' ), '//text//', i ); is( $wc->html2wiki( 'text' ), '**text**', 'strong' ); is( $wc->html2wiki( 'text' ), '//text//', 'em' ); is( $wc->html2wiki( '
' ), '----', 'hr' ); =head1 DESCRIPTION L (or H::WC, for short) is an HTML to wiki converter. It can convert HTML source into a variety of wiki markups, called wiki "dialects". This manual describes how you to create your own dialect to be plugged into HTML::WikiConverter. =head1 DIALECTS Each dialect has a separate dialect module containing rules for converting HTML into wiki markup specific for that dialect. Currently, all dialect modules are in the C package space and subclass HTML::WikiConverter. For example, the MediaWiki dialect module is L, while PhpWiki's is L. However, dialect modules need not be in the C package space; you may just as easily use C and H::WC will Do The Right Thing. From now on, I'll be using the terms "dialect" and "dialect module" interchangeably. =head2 Subclassing To interface with H::WC, dialects need to subclass it. This is done like so at the start of the dialect module: package HTML::WikiConverter::MySlimWiki; use base 'HTML::WikiConverter'; =head2 Conversion rules Dialects guide H::WC's conversion process with a set of rules that define how HTML elements are turned into their wiki counterparts. Each rule corresponds to an HTML tag and there may be any number of rules. Rules are specified in your dialect's C method, which returns a reference to a hash of rules. Each entry in the hash maps a tag name to a set of subrules, as in: $tag => \%subrules where C<$tag> is the name of the HTML tag (e.g., C<"b">, C<"em">, etc.) and C<%subrules> contains subrules that specify how that tag will be converted when it is encountered in the HTML input. 
=head3 Subrules The following subrules are recognized: start end preserve attributes empty replace alias block line_format line_prefix trim =head3 A simple example The following rules could be used for a dialect that uses C<*asterisks*> for bold and C<_underscores_> for italic text: sub rules { b => { start => '*', end => '*' }, i => { start => '_', end => '_' }, } =head3 Aliases To add CstrongE> and CemE> as aliases of CbE> and CiE>, use the C subrule: strong => { alias => 'b' }, em => { alias => 'i' }, (The C subrule cannot be used with any other subrule.) =head3 Blocks Many dialects separate paragraphs and other block-level elements with a blank line. To indicate this, use the C subrule: p => { block => 1 }, (To better support nested block elements, if a block elements are nested inside each other, blank lines are only added to the outermost element.) =head3 Line formatting Many dialects require that the text of an element be contained on a single line of text, or that it cannot contain any newlines, etc. These options can be specified using the C subrule, which can be assigned the value C<"single">, C<"multi">, or C<"blocks">. If the element must be contained on a single line, then the C subrule should be C<"single">. If the element can span multiple lines, but there can be no blank lines contained within, then use C<"multi">. If blank lines (which delimit blocks) are allowed, then use C<"blocks">. For example, paragraphs are specified like so in the MediaWiki dialect: p => { block => 1, line_format => 'multi', trim => 'both' }, =head3 Trimming whitespace The C subrule specifies whether leading or trailing whitespace (or both) should be stripped from the element. To strip leading whitespace only, use C<"leading">; for trailing whitespace, use C<"trailing">; for both, use the aptly named C<"both">; for neither (the default), use C<"none">. =head3 Line prefixes Some elements require that each line be prefixed with a particular string. 
This is specified with the C<line_prefix> subrule. For example, preformatted text in MediaWiki is prefixed with a space: pre => { block => 1, line_prefix => ' ' }, =head3 Replacement In some cases, conversion from HTML to wiki markup is as simple as string replacement. To replace a tag and its contents with a particular string, use the C<replace> subrule. For example, in PhpWiki, three percent signs, C<"%%%">, represent a line break, C<E<lt>brE<gt>>, hence: br => { replace => '%%%' }, (The C<replace> subrule cannot be used with any other subrule.) =head3 Preserving HTML tags Some dialects allow a subset of HTML in their markup. While H::WC ignores unhandled HTML tags by default (i.e., if H::WC encounters a tag that does not exist in a dialect's rule specification, then the contents of the tag are simply passed through to the wiki markup), you may specify that some be preserved using the C<preserve> subrule. For example, to allow the C<E<lt>fontE<gt>> tag in wiki markup: font => { preserve => 1 }, Preserved tags may also specify a list of attributes that may also pass through from HTML to wiki markup. This is done with the C<attributes> subrule: font => { preserve => 1, attributes => [ qw/ style class / ] }, (The C<attributes> subrule can only be used if the C<preserve> subrule is also present.) Some HTML elements have no content (e.g., line breaks, images) and the wiki dialect might require them to be preserved in a more XHTML-friendly way. To indicate that a preserved tag should have no content, use the C<empty> subrule. This will cause the element to be replaced with C<"E<lt>tag /E<gt>"> and no end tag. For example, MediaWiki handles line breaks like so: br => { preserve => 1, attributes => [ qw/ id class title style clear / ], empty => 1 }, This will convert, for example, C<"E<lt>br clear='both'E<gt>"> into C<"E<lt>br clear='both' /E<gt>">. Without specifying the C<empty> subrule, this would be converted into the (probably undesirable) C<"E<lt>br clear='both'E<gt>E<lt>/brE<gt>">. (The C<empty> subrule can only be used if the C<preserve> subrule is also present.)
=head3 Rules that depend on attribute values In some circumstances, you might want your dialect's conversion rules to depend on the value of one or more attributes. This can be achieved by producing rules in a conditional manner within C<rules()>. For example: sub rules { my $self = shift; my %rules = ( em => { start => "''", end => "''" }, strong => { start => "'''", end => "'''" }, ); $rules{i} = { preserve => 1 } if $self->preserve_italic; $rules{b} = { preserve => 1 } if $self->preserve_bold; return \%rules; } =head2 Dynamic subrules Instead of simple strings, you may use coderefs as values for the C<start>, C<end>, C<replace>, and C<line_prefix> subrules. If you do, the code will be called when the subrule is applied, and will be passed three arguments: the current H::WC object, the current L<HTML::Element> node being operated on, and a reference to the hash containing the dialect's subrules associated with elements of that type. For example, MoinMoin handles lists like so: ul => { line_format => 'multi', block => 1, line_prefix => ' ' }, li => { start => \&_li_start, trim => 'leading' }, ol => { alias => 'ul' }, It then defines C<_li_start()>: sub _li_start { my( $self, $node, $subrules ) = @_; my $bullet = ''; $bullet = '*' if $node->parent->tag eq 'ul'; $bullet = '1.' if $node->parent->tag eq 'ol'; return "\n$bullet "; } This prefixes every unordered list item with C<"*"> and every ordered list item with C<"1.">, which MoinMoin requires. It also puts each list item on its own line and places a space between the prefix and the content of the list item. =head2 Subrule validation Certain subrule combinations are not allowed. Hopefully it's intuitive why this is, but in case it's not, prohibited combinations have been mentioned above parenthetically. For example, the C<replace> and C<alias> subrules cannot be combined with any other subrules, and C<attributes> can only be specified alongside C<preserve>. Invalid subrule combinations will trigger a fatal error when the H::WC object is instantiated.
=head2 Dialect attributes H::WC's constructor accepts a number of attributes that help determine how conversion takes place. Dialects can alter these attributes or add their own by defining an C<attributes()> method, which returns a reference to a hash of attributes. Each entry in the hash maps the attribute's name to an attribute specification, as in: $attr => \%spec where C<$attr> is the name of the attribute and C<%spec> is a L<Params::Validate> specification for the attribute. For example, to add a boolean attribute called C<camel_case> which is disabled by default: sub attributes { camel_case => { default => 0 }, } Attributes defined like this are given accessor and mutator methods via Perl's C<AUTOLOAD> mechanism, so you can later say: my $ok = $wc->camel_case; $wc->camel_case(0); You may override the default H::WC attributes using this mechanism. For example, while H::WC considers the C<base_uri> attribute optional, it is required for the PbWiki dialect. PbWiki can override this default-optional behavior by saying: sub attributes { base_uri => { optional => 0 } } =head2 Preprocessing The first step H::WC takes in converting HTML source to wiki markup is to parse the HTML into a syntax tree using L<HTML::TreeBuilder>. It is often useful for dialects to preprocess the tree prior to converting it into wiki markup. Dialects that need to preprocess the tree can define a C<preprocess_node> method that will be called on each node of the tree (traversal is done in pre-order). The method receives two arguments, the H::WC object, and the current L<HTML::Element> node being traversed. It may modify the node or decide to ignore it; its return value is discarded. =head3 Built-in preprocessors Because they are commonly needed, H::WC automatically carries out two preprocessing steps, regardless of the dialect: 1) relative URIs in images and links are converted to absolute URIs (based upon the C<base_uri> parameter), and 2) ignorable text (e.g. between a C<E<lt>/tdE<gt>> and C<E<lt>tdE<gt>>) is discarded. H::WC also provides additional preprocessing steps that may be explicitly enabled by dialect modules.
=over =item strip_aname Removes any anchor elements that do not contain an C<href> attribute. =item caption2para Removes table captions and reinserts them as paragraphs before the table. =back Dialects may apply these optional preprocessing steps by calling them as methods on the dialect object inside C<preprocess_node>. For example: sub preprocess_node { my( $self, $node ) = @_; $self->strip_aname($node); $self->caption2para($node); } =head2 Postprocessing Once the work of converting HTML is complete, it is sometimes useful to postprocess the resulting wiki markup. Postprocessing can be used to clean up whitespace, fix subtle bugs introduced in the markup during conversion, etc. Dialects that want to postprocess the wiki markup should define a C<postprocess_output> method that will be called just before the C<html2wiki> method returns to the client. The method will be passed two arguments, the H::WC object and a reference to the wiki markup. The method may modify the wiki markup that the reference points to; its return value is discarded. For example, to replace a series of line breaks with a pair of newlines, a dialect might implement this: sub postprocess_output { my( $self, $outref ) = @_; $$outref =~ s/<br>\s*<br>/\n\n/gs; } (This example assumes that HTML line breaks were replaced with C<E<lt>brE<gt>> in the wiki markup.) =head2 Dialect utility methods H::WC defines a set of utility methods that dialect modules may find useful. =head3 get_elem_contents my $wiki = $wc->get_elem_contents( $node ); Converts the contents of C<$node> into wiki markup and returns the resulting wiki markup. =head3 get_wiki_page my $title = $wc->get_wiki_page( $url ); Attempts to extract the title of a wiki page from the given URL, returning the title on success, C<undef> on failure. If C<wiki_uri> is empty, this method always returns C<undef>. See L<HTML::WikiConverter/ATTRIBUTES> for details on how the C<wiki_uri> attribute is interpreted. =head3 is_camel_case my $ok = $wc->is_camel_case( $str ); Returns true if C<$str> is in CamelCase, false otherwise. CamelCase-ness is determined using the same rules that L<Kwiki>'s formatting module uses. =head3 get_attr_str my $attr_str = $wc->get_attr_str( $node, @attrs ); Returns a string containing the specified attributes in the given node. The returned string is suitable for insertion into an HTML tag. For example, if C<$node> contains the HTML