pax_global_header00006660000000000000000000000064136522713420014517gustar00rootroot0000000000000052 comment=cbc26f27c1d068ab018266df4a7dd3c5bd611bf8 libhtmlcleaner-java-2.24/000077500000000000000000000000001365227134200153525ustar00rootroot00000000000000libhtmlcleaner-java-2.24/.gitignore000066400000000000000000000001361365227134200173420ustar00rootroot00000000000000/bin /target /.settings /.classpath /.project /test-output pom.xml.versionsBackup *.swp *.jar libhtmlcleaner-java-2.24/.travis.yml000066400000000000000000000000171365227134200174610ustar00rootroot00000000000000language: java libhtmlcleaner-java-2.24/build.xml000066400000000000000000000127551365227134200172050ustar00rootroot00000000000000 HtmlCleaner libhtmlcleaner-java-2.24/config/000077500000000000000000000000001365227134200166175ustar00rootroot00000000000000libhtmlcleaner-java-2.24/config/MANIFEST.MF000066400000000000000000000001351365227134200202500ustar00rootroot00000000000000Manifest-Version: 1.0 Ant-Version: Apache Ant 1.8.4 Main-Class: org.htmlcleaner.CommandLinelibhtmlcleaner-java-2.24/doc/000077500000000000000000000000001365227134200161175ustar00rootroot00000000000000libhtmlcleaner-java-2.24/doc/release.txt000066400000000000000000000026261365227134200203060ustar00rootroot00000000000000Performing a release 1. Create and stage the Maven artefacts mvn clean deploy mvn release:clean mvn release:prepare mvn release:perform 2. Close the release Log into Sonatype, go to Staging Repositories, select the repository, and click Close 3. Verify everything is OK 4. Finalise the Maven release In Sonatype, select the repository, and click Release. 5. 
Create the file artefacts 5.1 Bin create a folder for the "bin" release add htmlcleaner-V.v.jar add htmlcleaner-V.v.jar.asc add license.txt Zip these up as htmlcleaner-V.v.zip 5.2 GUI create a folder for the "gui" release cd to the htmlcleaner-gui subproject update the pom.xml with the new version mvn clean package copy htmlcleaner-gui-V.v-with-dependencies.jar to the gui release folder rename to remove the "-with-dependencies" part of the file name gpg --armor --detach-sig htmlcleaner-gui-V.v.jar add license.txt Zip the jar, asc and license as htmlcleaner-gui-V.v.zip 5.3 Source create a folder for the "src" release svn export [svn location of tag] copy exported src folder into src release folder copy config, example.xml, license.txt, pom.xml into src release folder Zip as htmlcleaner-V.v-src.zip 6. Upload the zip files to Sourceforge Create a new folder for the new version, and upload all three zip files Select the htmlcleaner-V.v.zip file, and click Details Click "select all" and save to make it the default download 7. 
Update the websitelibhtmlcleaner-java-2.24/example.xml000066400000000000000000000577061365227134200175460ustar00rootroot00000000000000 bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font h1,h2,h3,h4,h5,h6,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font h1,h2,h3,h4,h5,h6,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font h1,h2,h3,h4,h5,h6,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font h1,h2,h3,h4,h5,h6,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font h1,h2,h3,h4,h5,h6,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font h1,h2,h3,h4,h5,h6,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font p,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml nobr a map area map bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml 
bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font li,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml dt,dd dt,dd bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml tr,tbody,thead,tfoot,colgroup,caption,tr bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font tr,thead,tbody,tfoot,caption,colgroup,table,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml table tbody td,th thead,tfoot tr,td,th,caption,colgroup table tr td,th,caption,colgroup table tr td,th,caption,colgroup table tr,form td,th,tr,tbody,thead,tfoot,caption,colgroup table tr,form td,th,tr,tbody,thead,tfoot,caption,colgroup table tr,form td,th,tr,tbody,thead,tfoot,caption,colgroup colgroup table col td,th,tr,tbody,thead,tfoot,caption,colgroup table td,th,tr,tbody,thead,tfoot,caption,colgroup form bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font option,optgroup,textarea,select,fieldset,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml select,optgroup,option select,optgroup,option option,optgroup option,optgroup,select select option select option optgroup select,optgroup,option bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml u,i,tt,sub,sup,big,small,strike,blink,s b,u,tt,sub,sup,big,small,strike,blink,s b,i,tt,sub,sup,big,small,strike,blink,s b,u,i,sub,sup,big,small,strike,blink,s 
b,u,i,tt,sup,big,small,strike,blink,s b,u,i,tt,sub,big,small,strike,blink,s b,u,i,tt,sub,sup,small,strike,blink,s b,u,i,tt,sub,sup,big,strike,blink,s b,u,i,tt,sub,sup,big,small,blink,s b,u,i,tt,sub,sup,big,small,strike,s bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml b,u,i,tt,sub,sup,big,small,strike,blink bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml libhtmlcleaner-java-2.24/htmlcleaner-gui/000077500000000000000000000000001365227134200204325ustar00rootroot00000000000000libhtmlcleaner-java-2.24/htmlcleaner-gui/config/000077500000000000000000000000001365227134200216775ustar00rootroot00000000000000libhtmlcleaner-java-2.24/htmlcleaner-gui/config/MANIFEST.MF000066400000000000000000000001311365227134200233240ustar00rootroot00000000000000Manifest-Version: 1.0 Ant-Version: Apache Ant 1.8.4 Main-Class: org.htmlcleaner.GUI libhtmlcleaner-java-2.24/htmlcleaner-gui/pom.xml000066400000000000000000000173121365227134200217530ustar00rootroot00000000000000 4.0.0 net.sourceforge.htmlcleaner htmlcleaner-gui jar 2.23 HtmlCleaner GUI HtmlCleaner is an HTML parser written in Java. It transforms dirty HTML to well-formed XML following the same rules that most web-browsers use. HTMLCleaner GUI is a Swing application that can be used with it. 
http://htmlcleaner.sourceforge.net/ 2013 BSD License http://www.opensource.org/licenses/bsd-license.php OWNER = Marton Szeles YEAR = 2013-2015 repo Marton Szeles marlyy Developer 1 marlyy@users.sourceforge.net Vladimir Nikic vnikic Project Admin Developer 1 vnikic@users.sourceforge.net Patrick Moore patmoore Project Admin Developer -8 patmoore@farreach.es Scott Wilson scottbw Project Admin Developer 1 scottbw@apache.org 3.0.4 https://svn.code.sf.net/p/htmlcleaner/code/trunk/ scm:svn:https://svn.code.sf.net/p/htmlcleaner/code/trunk/ scm:svn:https://svn.code.sf.net/p/htmlcleaner/code/trunk/ UTF-8 net.sourceforge.htmlcleaner htmlcleaner ${project.version} maven-assembly-plugin org.htmlcleaner.GUI jar-with-dependencies make-assembly package single org.apache.maven.plugins maven-deploy-plugin 2.7 false org.apache.maven.plugins maven-source-plugin 2.2.1 attach-sources verify jar test-jar true org.apache.maven.plugins maven-jar-plugin 2.4 config/MANIFEST.MF org.apache.maven.plugins maven-compiler-plugin 3.0 1.5 1.5 UTF-8 true org.apache.maven.plugins maven-javadoc-plugin 2.9 attach-javadocs jar org.apache.maven.plugins maven-gpg-plugin 1.4 sign-artifacts verify sign 98654FBE org.sonatype.oss oss-parent 7 
libhtmlcleaner-java-2.24/htmlcleaner-gui/src/000077500000000000000000000000001365227134200212215ustar00rootroot00000000000000libhtmlcleaner-java-2.24/htmlcleaner-gui/src/main/000077500000000000000000000000001365227134200221455ustar00rootroot00000000000000libhtmlcleaner-java-2.24/htmlcleaner-gui/src/main/java/000077500000000000000000000000001365227134200230665ustar00rootroot00000000000000libhtmlcleaner-java-2.24/htmlcleaner-gui/src/main/java/org/000077500000000000000000000000001365227134200236555ustar00rootroot00000000000000libhtmlcleaner-java-2.24/htmlcleaner-gui/src/main/java/org/htmlcleaner/000077500000000000000000000000001365227134200261535ustar00rootroot00000000000000libhtmlcleaner-java-2.24/htmlcleaner-gui/src/main/java/org/htmlcleaner/GUI.java000066400000000000000000000500611365227134200274440ustar00rootroot00000000000000package org.htmlcleaner; /** Copyright (c) 2013, Marton Szeles All rights reserved. Redistribution and use of this software in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * The name of HtmlCleaner may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**/

import javax.swing.*;
import javax.swing.filechooser.FileNameExtensionFilter;

import org.htmlcleaner.conditional.TagNodeEmptyContentCondition;

import java.awt.event.*;
import java.awt.*;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;

/**
 * Swing front end for HtmlCleaner.
 *
 * <p>The window lets the user pick an input (local file or http/https URL) and an
 * output file, toggle a set of {@link CleanerProperties} flags through checkboxes,
 * choose between the HTML 4 and HTML 5 tag sets, and write the cleaned document as
 * pretty-printed XML (always encoded as utf-8).</p>
 *
 * <p>All components are placed with absolute bounds (null layout), so the pixel
 * coordinates below are significant.</p>
 */
public class GUI extends JFrame {

    private static final long serialVersionUID = 1L;

    /** Background colour shared by every option checkbox and the help link. */
    private static final Color OPTION_BACKGROUND = new Color(255, 250, 250);

    /** Character encoding used both for reading the input and writing the output. */
    private static final String ENCODING = "utf-8";

    /** Title used for the frame and for error dialogs. */
    private static final String WINDOW_TITLE = "HTML Cleaner";

    /** Callback that forwards a checkbox state to one boolean cleaner property. */
    private interface BooleanSetter {
        void set(boolean value);
    }

    private final JTextField inputText = new JTextField();
    private final JTextField outputText = new JTextField();

    /** Cleaner configuration edited through the checkboxes and the HTML version combo. */
    private final CleanerProperties props = new CleanerProperties();

    /**
     * Builds the complete window and shows it.
     *
     * @throws MalformedURLException kept for backward compatibility with existing callers
     * @throws IOException kept for backward compatibility with existing callers
     */
    public GUI() throws MalformedURLException, IOException {
        final Container container = getContentPane();
        container.setLayout(null);

        // IMAGE (clickable logo linking to the project homepage)
        JLabel lblImage = new JLabel("");
        lblImage.setToolTipText("Visit homepage");
        lblImage.setBounds(283, 11, 209, 52);
        container.add(lblImage);
        // getResource returns null when the image is not on the class path; skip the
        // icon in that case instead of failing with a NullPointerException.
        URL logoUrl = this.getClass().getResource("images/logo.jpg");
        if (logoUrl != null) {
            lblImage.setIcon(new ImageIcon(logoUrl));
        }
        makeLink(lblImage, "http://htmlcleaner.sourceforge.net/index.php");

        // INPUT source ------------
        JLabel lblInputFile = new JLabel("Input File / URL:");
        lblInputFile.setForeground(Color.RED);
        lblInputFile.setBounds(44, 96, 114, 14);
        container.add(lblInputFile);

        inputText.setToolTipText("Enter path of local HTML file or enter URL starting with \"http://\"");
        inputText.setBounds(168, 91, 399, 25);
        container.add(inputText);
        inputText.setColumns(10);

        // OUTPUT source ------------
        JLabel lblOutputFile = new JLabel("Output File:");
        lblOutputFile.setForeground(Color.RED);
        lblOutputFile.setBounds(44, 138, 114, 14);
        container.add(lblOutputFile);

        outputText.setToolTipText("Enter the path of the output file");
        outputText.setBounds(168, 133, 399, 25);
        container.add(outputText);
        outputText.setColumns(10);

        // FILECHOOSER INPUT BUTTON
        JButton btnChooseFile = new JButton("Choose File");
        btnChooseFile.setToolTipText("Choose input file");
        btnChooseFile.setBounds(588, 91, 126, 24);
        container.add(btnChooseFile);
        // ActionListener (rather than a mouse listener) so the button also reacts to
        // keyboard activation.
        btnChooseFile.addActionListener(new ActionListener() {
            public void actionPerformed(ActionEvent event) {
                String selected = chooseFile(false);
                if (selected != null) {
                    inputText.setText(selected);
                    outputText.setText(defaultOutputPath(selected));
                }
            }
        });

        // FILECHOOSER OUTPUT BUTTON
        JButton btnChooseFile2 = new JButton("Choose File");
        btnChooseFile2.setToolTipText("Choose output file");
        btnChooseFile2.setBounds(588, 134, 126, 23);
        container.add(btnChooseFile2);
        btnChooseFile2.addActionListener(new ActionListener() {
            public void actionPerformed(ActionEvent event) {
                // The output file usually does not exist yet, hence a save dialog.
                String selected = chooseFile(true);
                if (selected != null) {
                    outputText.setText(selected);
                }
            }
        });

        // CLEAR FIELDS BUTTON
        JButton clearButton = new JButton("Clear fields");
        clearButton.setToolTipText("Clear the input/output fields");
        clearButton.setBackground(new Color(240, 128, 128));
        clearButton.setForeground(new Color(128, 128, 128));
        clearButton.setBounds(441, 171, 126, 14);
        container.add(clearButton);
        clearButton.addActionListener(new ActionListener() {
            public void actionPerformed(ActionEvent event) {
                inputText.setText("");
                outputText.setText("");
            }
        });

        // PROPERTIES LABEL
        JLabel lblNewLabel = new JLabel("Properties");
        lblNewLabel.setFont(new Font("Tahoma", Font.PLAIN, 18));
        lblNewLabel.setBounds(29, 181, 154, 25);
        container.add(lblNewLabel);

        // Checkboxes. The "selected" flag is only the initial UI state; as in the
        // original code the property itself is not written until the box is toggled.
        // NOTE(review): this assumes the CleanerProperties defaults match the initial
        // check marks - confirm against CleanerProperties.

        //1
        addOption(container, "Advanced XML-Escape", true, 29, 222, 222, 25, new BooleanSetter() {
            public void set(boolean value) {
                // Previously both branches passed true, so un-checking had no effect.
                props.setAdvancedXmlEscape(value);
            }
        });
        //2
        addOption(container, "Translate Special Entities", true, 29, 250, 222, 25, new BooleanSetter() {
            public void set(boolean value) {
                props.setTranslateSpecialEntities(value);
            }
        });
        //3
        addOption(container, "Recognize Unicode Chars", true, 29, 278, 222, 25, new BooleanSetter() {
            public void set(boolean value) {
                props.setRecognizeUnicodeChars(value);
            }
        });
        //4
        addOption(container, "CDATA for Script & Style", true, 29, 306, 222, 25, new BooleanSetter() {
            public void set(boolean value) {
                props.setUseCdataForScriptAndStyle(value);
            }
        });
        //5
        final JCheckBox setOmitUnknownTags =
                addOption(container, "Omit Unknown Tags", false, 29, 334, 222, 25, new BooleanSetter() {
                    public void set(boolean value) {
                        props.setOmitUnknownTags(value);
                    }
                });
        //6
        final JCheckBox setTreatUnknownTagsAsContent =
                addOption(container, "Unknown Tags as Content", false, 29, 362, 222, 29, new BooleanSetter() {
                    public void set(boolean value) {
                        props.setTreatUnknownTagsAsContent(value);
                    }
                });
        //7
        final JCheckBox setOmitDeprecatedTags =
                addOption(container, "Omit Deprecated Tags", false, 253, 221, 239, 25, new BooleanSetter() {
                    public void set(boolean value) {
                        props.setOmitDeprecatedTags(value);
                    }
                });
        //8
        final JCheckBox setTreatDeprecatedTagsAsContent =
                addOption(container, "Deprecated Tags as Content", false, 253, 249, 239, 25, new BooleanSetter() {
                    public void set(boolean value) {
                        props.setTreatDeprecatedTagsAsContent(value);
                    }
                });
        //9
        final JCheckBox setOmitComments =
                addOption(container, "Omit Comments", false, 253, 277, 239, 25, new BooleanSetter() {
                    public void set(boolean value) {
                        props.setOmitComments(value);
                    }
                });
        //10
        final JCheckBox setOmitXmlDeclaration =
                addOption(container, "Omit Xml Declaration", false, 253, 305, 239, 25, new BooleanSetter() {
                    public void set(boolean value) {
                        props.setOmitXmlDeclaration(value);
                    }
                });
        //11
        addOption(container, "Omit Doctype Declaration", true, 253, 333, 239, 25, new BooleanSetter() {
            public void set(boolean value) {
                props.setOmitDoctypeDeclaration(value);
            }
        });
        //12
        addOption(container, "Use Empty Element Tags", true, 253, 361, 239, 30, new BooleanSetter() {
            public void set(boolean value) {
                props.setUseEmptyElementTags(value);
            }
        });
        //13
        addOption(container, "Allow Multi-Word Attributes", true, 494, 222, 238, 25, new BooleanSetter() {
            public void set(boolean value) {
                props.setAllowMultiWordAttributes(value);
            }
        });
        //14
        final JCheckBox setAllowHtmlInsideAttributes =
                addOption(container, "Allow HTML Inside Attributes", false, 494, 250, 238, 25, new BooleanSetter() {
                    public void set(boolean value) {
                        props.setAllowHtmlInsideAttributes(value);
                    }
                });
        //15
        addOption(container, "Ignore Quest & Exclam", true, 494, 306, 238, 28, new BooleanSetter() {
            public void set(boolean value) {
                props.setIgnoreQuestAndExclam(value);
            }
        });
        //16
        final JCheckBox setNamespacesAware =
                addOption(container, "Namespaces Aware", false, 494, 278, 238, 25, new BooleanSetter() {
                    public void set(boolean value) {
                        props.setNamespacesAware(value);
                    }
                });

        // PROPERTIES link
        JLabel lblHelp = new JLabel("Settings behaviour explained");
        lblHelp.setToolTipText("Click here for details");
        lblHelp.setBackground(OPTION_BACKGROUND);
        lblHelp.setForeground(Color.BLUE);
        lblHelp.setBounds(498, 372, 234, 29);
        container.add(lblHelp);
        makeLink(lblHelp, "http://htmlcleaner.sourceforge.net/parameters.php");

        // CLEAN Button
        JButton btnClean = new JButton("Clean");
        btnClean.setToolTipText("Start the cleaning: generate clean XML");
        btnClean.setFont(new Font("Tahoma", Font.PLAIN, 18));
        btnClean.setBounds(614, 412, 100, 25);
        container.add(btnClean);
        btnClean.addActionListener(new ActionListener() {
            public void actionPerformed(ActionEvent event) {
                clean();
            }
        });

        // HTML version selector
        JLabel lblUse = new JLabel("Use:");
        lblUse.setBounds(498, 339, 27, 20);
        container.add(lblUse);

        String com[] = {"Html 4", "Html 5"};
        JComboBox comboBox = new JComboBox(com);
        // Selected before the listener is attached, so no event fires for the initial value.
        comboBox.setSelectedIndex(1);
        comboBox.setBounds(531, 334, 62, 25);
        container.add(comboBox);
        comboBox.addItemListener(new ItemListener() {
            @Override
            public void itemStateChanged(ItemEvent e) {
                if (e.getStateChange() != ItemEvent.SELECTED) {
                    return;
                }
                String item = (String) e.getItem();
                if ("Html 4".equals(item)) {
                    props.setHtmlVersion(HtmlCleaner.HTML_4);
                } else {
                    props.setHtmlVersion(HtmlCleaner.HTML_5);
                }
                // NOTE(review): reset() is called after setHtmlVersion, and only the boxes
                // below are re-synchronised with the UI; boxes that start checked keep
                // whatever state the user gave them. Confirm what reset() restores before
                // changing this order or the list of boxes.
                props.reset();
                setOmitUnknownTags.setSelected(false);
                setTreatUnknownTagsAsContent.setSelected(false);
                setOmitDeprecatedTags.setSelected(false);
                setTreatDeprecatedTagsAsContent.setSelected(false);
                setOmitComments.setSelected(false);
                setOmitXmlDeclaration.setSelected(false);
                setAllowHtmlInsideAttributes.setSelected(false);
                setNamespacesAware.setSelected(false);
            }
        });

        // Window setup. The frame is shown last, once it has its size and all children;
        // components added to an already visible null-layout container are not painted
        // until the next repaint.
        setTitle(WINDOW_TITLE);
        setIconImage(Toolkit.getDefaultToolkit().getImage("icon.png"));
        setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
        container.setBackground(Color.WHITE);
        setSize(767, 497);
        setVisible(true);
    }

    /**
     * Creates one option checkbox, places it and wires it to a cleaner property.
     *
     * @param container parent the checkbox is added to
     * @param label text shown next to the box
     * @param selected initial check state (UI only, the property is not written here)
     * @param x left edge in pixels
     * @param y top edge in pixels
     * @param width width in pixels
     * @param height height in pixels
     * @param setter receives the new state every time the user toggles the box
     * @return the created checkbox
     */
    private JCheckBox addOption(Container container, String label, boolean selected,
                                int x, int y, int width, int height, final BooleanSetter setter) {
        final JCheckBox box = new JCheckBox(label);
        box.setBackground(OPTION_BACKGROUND);
        box.setSelected(selected);
        box.setBounds(x, y, width, height);
        container.add(box);
        box.addActionListener(new ActionListener() {
            public void actionPerformed(ActionEvent event) {
                setter.set(box.isSelected());
            }
        });
        return box;
    }

    /**
     * Makes a label behave like a hyperlink: hand cursor while hovering, opens the
     * given address in the system browser on click.
     */
    private void makeLink(JLabel label, final String address) {
        label.addMouseListener(new MouseAdapter() {
            public void mouseEntered(MouseEvent me) {
                setCursor(new Cursor(Cursor.HAND_CURSOR));
            }

            public void mouseExited(MouseEvent me) {
                setCursor(Cursor.getDefaultCursor());
            }

            public void mouseClicked(MouseEvent clickLink) {
                openInBrowser(address);
            }
        });
    }

    /** Opens the address in the default browser, reporting failures in a dialog. */
    private void openInBrowser(String address) {
        // Desktop.getDesktop() throws on platforms without desktop integration.
        if (!Desktop.isDesktopSupported() || !Desktop.getDesktop().isSupported(Desktop.Action.BROWSE)) {
            showError("Cannot open a web browser on this platform. Please visit " + address);
            return;
        }
        try {
            Desktop.getDesktop().browse(new URL(address).toURI());
        } catch (IOException e) {
            e.printStackTrace();
            showError("Could not open " + address + ": " + e.getMessage());
        } catch (URISyntaxException e) {
            e.printStackTrace();
            showError("Could not open " + address + ": " + e.getMessage());
        }
    }

    /**
     * Shows a file chooser restricted to HTML and XML files.
     *
     * @param forSaving true for a save dialog (output file), false for an open dialog
     * @return the selected path, or null when the user cancelled
     */
    private String chooseFile(boolean forSaving) {
        JFileChooser chooser = new JFileChooser();
        chooser.setFileFilter(new FileNameExtensionFilter("HTML & XML files", "html", "xml"));
        int result = forSaving ? chooser.showSaveDialog(this) : chooser.showOpenDialog(this);
        if (result != JFileChooser.APPROVE_OPTION) {
            return null;
        }
        return chooser.getSelectedFile().getPath();
    }

    /**
     * Derives the suggested output path from an input path by replacing the file
     * extension with ".xml". A path whose file name has no extension (or only a
     * leading dot) simply gets ".xml" appended; dots in directory names are ignored.
     */
    private static String defaultOutputPath(String inputPath) {
        int nameStart = inputPath.lastIndexOf(File.separatorChar) + 1;
        int dot = inputPath.lastIndexOf('.');
        String base = dot > nameStart ? inputPath.substring(0, dot) : inputPath;
        return base + ".xml";
    }

    /** Creates a cleaner for the currently selected HTML version. */
    @SuppressWarnings("deprecation")
    private HtmlCleaner newCleaner() {
        if (props.getHtmlVersion() == 4) {
            return new HtmlCleaner(Html4TagProvider.INSTANCE, props);
        }
        return new HtmlCleaner(props);
    }

    /**
     * Cleans the input named in the input field and writes pretty-printed XML to the
     * path in the output field. Inputs starting with "http://" or "https://" are
     * fetched as URLs, everything else is treated as a local file.
     */
    @SuppressWarnings("deprecation") // carried over from the original click handler
    private void clean() {
        String inputSrc = inputText.getText();
        String outputSrc = outputText.getText();
        if (inputSrc.trim().length() == 0 || outputSrc.trim().length() == 0) {
            showError("Please enter both an input file / URL and an output file.");
            return;
        }

        try {
            TagNode tagNode;
            if (inputSrc.startsWith("http://") || inputSrc.startsWith("https://")) {
                // It's a URL
                tagNode = newCleaner().clean(new URL(inputSrc), ENCODING);
            } else {
                // It's a FILE
                // NOTE(review): as in the original, the prune condition is only registered
                // for file input and is registered again on every run - confirm whether
                // that is intended.
                props.addPruneTagNodeCondition(new TagNodeEmptyContentCondition(props.getTagInfoProvider()));
                tagNode = newCleaner().clean(new File(inputSrc), ENCODING);
            }
            new PrettyXmlSerializer(props).writeToFile(tagNode, outputSrc, ENCODING);
        } catch (IOException e) {
            // MalformedURLException is an IOException, so bad URLs end up here too.
            e.printStackTrace();
            showError("Cleaning failed: " + e.getMessage());
        }
    }

    /** Reports a problem to the user in a modal error dialog. */
    private void showError(String message) {
        JOptionPane.showMessageDialog(this, message, WINDOW_TITLE, JOptionPane.ERROR_MESSAGE);
    }

    /**
     * Application entry point: creates the window on the AWT event dispatch thread.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        java.awt.EventQueue.invokeLater(new Runnable() {
            public void run() {
                try {
                    new GUI();
                } catch (MalformedURLException e) {
                    e.printStackTrace();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        });
    }
}
libhtmlcleaner-java-2.24/htmlcleaner-gui/src/main/resources/000077500000000000000000000000001365227134200241575ustar00rootroot00000000000000libhtmlcleaner-java-2.24/htmlcleaner-gui/src/main/resources/org/000077500000000000000000000000001365227134200247465ustar00rootroot00000000000000libhtmlcleaner-java-2.24/htmlcleaner-gui/src/main/resources/org/htmlcleaner/000077500000000000000000000000001365227134200272445ustar00rootroot00000000000000libhtmlcleaner-java-2.24/htmlcleaner-gui/src/main/resources/org/htmlcleaner/images/000077500000000000000000000000001365227134200305115ustar00rootroot00000000000000libhtmlcleaner-java-2.24/htmlcleaner-gui/src/main/resources/org/htmlcleaner/images/logo.jpg000066400000000000000000000124321365227134200321550ustar00rootroot00000000000000JFIFHHC     C   5" }!1AQa"q2#BR$3br %&'()*456789:CDEFGHIJSTUVWXYZcdefghijstuvwxyz w!1AQaq"2B #3Rbr $4%&'()*56789:CDEFGHIJSTUVWXYZcdefghijstuvwxyz ?SNp>7+ޘo|Y[iV˙ȗ,ÌgԊo_i7 ?!) !t+:grδg||񇆴?}iA?C/lsb'xŋ7\OiʓI--́"RzvV3xWw*:.D,Af8UzHanђi'+̝zPJOvodv=^'Ӽͪj{HN hp˸gҼ/4 ΒnW-GtmS_llo[mcv~b'Ɩ&:)ϭoeWJ6&~[O#i"#T]zĊ#Jq 4[I?`f"YN {{y'iӕk|>dڷ.y}E;m"QeGco wt9ޞkC > nZP\{U/iL'g&Nv0$uXlDb*;(W9*kI_{ t/ߴ5}C?w2y~Vs߅xmYoᾷv^pSǣ7O7*v08S0|O#z}>>:[N|@ k>+hr:>H1)2F2#%!87xZ< W$$ОpHc}oHT~>x.%kfQt) y1gOuw6ֵx^}2M6Vr·̲4k/o@𕷊u7BY# fB<,{*{Ǘ_m[}C[ƝqH|m2H!WRQ^AZ-oIw|:/NV>\6\pp $x;hK_ [f*P) ,rbYTgw?hk_ڇ 7[xþΥ%Λj|; ˆ8ݏx]>Ogo|Jm<+m"ʷrbl 7HFs@]gWLu{:_y(_`|q 6kk|)٧᭧]vB6n5+]@L@i!:QmY'៍y7ʚ, `dPȽr>MsO[VeYbz) P_죭x-Ŀj$AQYO6ܬrpC)<3hOf |klF.px U^aҾGE6v7߶dBy2pG#yoĩ "xmgYpU2[.vOh?Ɨ^=mCc"[V ^,u}i6kHES!\rH+ҩ*Xo?q->gQWy/w/}~G3HN*?htu$m]j0\tk B0Y1N;s\t/VaV"H0>ΈyN>͎Qʴ,?=v:puiJnirw]Rhh=qOԭK0?j>3x{QO][JǑybE=^MIH~;Z[2v=:+өJr֒Z:i|-?yEaiEUFV⺃S$fPs{|!AE8L.3B=cu3.&qmGM"ΏHBNw`N|~);QF'k:c[rTc9^X(RFFCElKy 98k z1x(T& cNCQѴO6zO|u{24Fzɂy3Ǫ|1tA𥿅׋4@GqD~^t\ss`okx{úl_Yjp !rHq_>8ǯ |7C=g H6J{0qU5>I ŦÞ#k˯jr,K0$.IU(gQvek 3G`i7; %~$|k~Gmx!5}\2psxǾ >774XkePInB.v8_5_ML&86. 
FqQfco~?OB$mFω8~*b-/D$Y>*}e SygX[(Uyҩɾ 0}̚7kiftg -ĞlQ֝|4%홥=;w0t/_-Ǚ#o<8Vύ| ~?MvGg_)Sf䪍d~)|^ AtTGH8 (98w W^&#GCw<-Λ6kMZ-bDLa؞}( 1Jo୯/ՊJI*=k']&KdYog=p$$- I=L i3![7YbdrSh$_$8U$UӵMCZ,TXc8 NKJs)#;h=ef~,կm6͵+|=^۠PX%^p rVMyj|Q9::;)kflp9!$pC&QcfO,lgeu=Š] _6+lcM4kEC 2KV3?x߈w< ֕;OtEŲDd_f$bDu1E6uFs]z| u@4h`:Ka3@t8Oumy{VFT^#5 )~oS^u77KLrDjQӥ/_xo]m FUFI;?;&Ri] ]iCP[]c,"'gsrǮ+cVm4=2UomihK4Si'oDeȆ y$gO~#a^$@L'؄\*/ d&`m8ɬ(mM>c&7X,7cq=SzR׍lz͊r5AO`"./5M)XјM#t0o,g=umFS aG $nf UAqZs_>xqsO&&.$v{*H& ;x$M}^}c{e}k~Pk ,k1%7Eǿ⑻q_0T]IO ^G:Z'QxPqZZG.4:yfz M!ao0[Em|#F=bumVGO YGnRn2q RLvd%HnWV0AbI0=P0L`zQE`zQE0FEQ@ pF=xK` Q@#Q@FEm4A(hQ@Q@A(3FsEQElibhtmlcleaner-java-2.24/licence.txt000066400000000000000000000031321365227134200175140ustar00rootroot00000000000000Copyright (c) 2006-2019, the HTMLCleaner project All rights reserved. Redistribution and use of this software in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * The name of HtmlCleaner may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Report any issues and contact the developers through Sourceforge at https://sourceforge.net/projects/htmlcleaner/libhtmlcleaner-java-2.24/pom.xml000066400000000000000000000202341365227134200166700ustar00rootroot00000000000000 4.0.0 net.sourceforge.htmlcleaner htmlcleaner bundle 2.24 HtmlCleaner HtmlCleaner is an HTML parser written in Java. It transforms dirty HTML to well-formed XML following the same rules that most web-browsers use. http://htmlcleaner.sourceforge.net/ 2006 BSD License http://www.opensource.org/licenses/bsd-license.php OWNER = Vladimir Nikic, Pat Moore and Scott Wilson YEAR = 2006-2015 repo Vladimir Nikic vnikic Project Admin Developer 1 vnikic@users.sourceforge.net Patrick Moore patmoore Project Admin Developer -8 patmoore@farreach.es Scott Wilson scottbw Project Admin Developer 1 scottbw@apache.org 3.0.4 https://svn.code.sf.net/p/htmlcleaner/code/tags/htmlcleaner-2.24 scm:svn:https://svn.code.sf.net/p/htmlcleaner/code/tags/htmlcleaner-2.24 scm:svn:https://svn.code.sf.net/p/htmlcleaner/code/tags/htmlcleaner-2.24 UTF-8 1.0-beta-7 org.jdom jdom2 2.0.5 org.apache.ant ant 1.9.0 provided true junit junit 4.11 test true org.apache.felix maven-bundle-plugin 3.0.1 true org.htmlcleaner.* org.htmlcleaner.CommandLine org.apache.maven.plugins maven-deploy-plugin 2.7 false org.apache.maven.plugins maven-source-plugin 2.2.1 attach-sources verify jar test-jar true org.apache.maven.plugins maven-jar-plugin 2.4 config/MANIFEST.MF org.apache.maven.plugins 
maven-compiler-plugin 3.0 1.5 1.5 UTF-8 true org.apache.maven.plugins maven-javadoc-plugin 2.9 attach-javadocs jar -Xdoclint:none org.apache.maven.plugins maven-gpg-plugin 1.4 sign-artifacts verify sign C9F09252 org.apache.maven.wagon wagon-ssh-external ${wagon-build} org.apache.maven.wagon wagon-ftp ${wagon-build} org.sonatype.oss oss-parent 7 libhtmlcleaner-java-2.24/src/000077500000000000000000000000001365227134200161415ustar00rootroot00000000000000libhtmlcleaner-java-2.24/src/main/000077500000000000000000000000001365227134200170655ustar00rootroot00000000000000libhtmlcleaner-java-2.24/src/main/java/000077500000000000000000000000001365227134200200065ustar00rootroot00000000000000libhtmlcleaner-java-2.24/src/main/java/org/000077500000000000000000000000001365227134200205755ustar00rootroot00000000000000libhtmlcleaner-java-2.24/src/main/java/org/htmlcleaner/000077500000000000000000000000001365227134200230735ustar00rootroot00000000000000libhtmlcleaner-java-2.24/src/main/java/org/htmlcleaner/AttributeTransformation.java000066400000000000000000000036201365227134200306310ustar00rootroot00000000000000/* Copyright (c) 2006-2007, Vladimir Nikic All rights reserved. Redistribution and use of this software in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * The name of HtmlCleaner may not be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You can contact Vladimir Nikic by sending e-mail to nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the subject line. Additional work by Amplafi. -- All rights released. */ package org.htmlcleaner; public interface AttributeTransformation { boolean satisfy(String attName, String attValue); String getTemplate(); }libhtmlcleaner-java-2.24/src/main/java/org/htmlcleaner/AttributeTransformationPatternImpl.java000066400000000000000000000057501365227134200330170ustar00rootroot00000000000000/* Copyright (c) 2006-2007, Vladimir Nikic All rights reserved. Redistribution and use of this software in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * The name of HtmlCleaner may not be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You can contact Vladimir Nikic by sending e-mail to nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the subject line. Additional work by Amplafi. -- All rights released. */ package org.htmlcleaner; import java.util.regex.Pattern; public class AttributeTransformationPatternImpl implements AttributeTransformation { private final Pattern attNamePattern; private final Pattern attValuePattern; private final String template; public AttributeTransformationPatternImpl(Pattern attNamePattern, Pattern attValuePattern, String template) { this.attNamePattern = attNamePattern; this.attValuePattern = attValuePattern; this.template = template; } public AttributeTransformationPatternImpl(String attNamePattern, String attValuePattern, String template) { this.attNamePattern = attNamePattern ==null?null:Pattern.compile(attNamePattern); this.attValuePattern = attValuePattern == null? 
null: Pattern.compile(attValuePattern); this.template = template; } public boolean satisfy(String attName, String attValue) { if ( (attNamePattern == null || attNamePattern.matcher(attName).find()) && (attValuePattern ==null || attValuePattern.matcher(attValue).find())){ return true; } else { return false; } } /** * @return the template */ public String getTemplate() { return template; } }libhtmlcleaner-java-2.24/src/main/java/org/htmlcleaner/BaseHtmlNode.java000066400000000000000000000014521365227134200262450ustar00rootroot00000000000000package org.htmlcleaner; import java.io.IOException; import java.io.Writer; import java.util.ArrayList; import java.util.List; public class BaseHtmlNode extends BaseTokenImpl implements HtmlNode { protected TagNode parent; public List getSiblings(){ // // If this is a root node, return an empty list // if (this.parent == null) { return new ArrayList(); }; // // Otherwise, return all the children, including this node // return this.parent.getAllChildren(); } public TagNode getParent() { return parent; } public void setParent(TagNode parent) { this.parent = parent; } public void serialize(Serializer serializer, Writer writer) throws IOException { // TODO Auto-generated method stub } } libhtmlcleaner-java-2.24/src/main/java/org/htmlcleaner/BaseToken.java000066400000000000000000000045271365227134200256210ustar00rootroot00000000000000/* Copyright (c) 2006-2007, Vladimir Nikic All rights reserved. Redistribution and use of this software in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
* The name of HtmlCleaner may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You can contact Vladimir Nikic by sending e-mail to nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the subject line. */ package org.htmlcleaner; import java.io.IOException; import java.io.Writer; /** *

* Base token interface. Tokens are individual entities recognized by HTML parser. *

*/ public interface BaseToken { public void serialize(Serializer serializer, Writer writer) throws IOException; /** * @return row in source html where the token was found */ public int getRow(); /** * @param row */ public void setRow(int row); /** * @return col in source html where the token was found */ public int getCol(); /** * @param col */ public void setCol(int col); } libhtmlcleaner-java-2.24/src/main/java/org/htmlcleaner/BaseTokenImpl.java000066400000000000000000000013441365227134200264350ustar00rootroot00000000000000package org.htmlcleaner; /** * Base class for all tokens. Allows position tracking. * * @author Konstantin Burov (aectann@gmail.com) * */ public abstract class BaseTokenImpl implements BaseToken { private int row; private int col; protected BaseTokenImpl(){ } protected BaseTokenImpl(int row, int col) { this.row = row; this.col = col; } public int getRow() { return row; } public void setRow(int row) { this.row = row; } public int getCol() { return col; } public void setCol(int col) { this.col = col; } @Override public String toString() { return "(line="+getRow()+", col="+getCol()+")"; } } libhtmlcleaner-java-2.24/src/main/java/org/htmlcleaner/BelongsTo.java000066400000000000000000000050621365227134200256350ustar00rootroot00000000000000/* Redistribution and use of this software in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * The name of HtmlCleaner may not be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You can contact Vladimir Nikic by sending e-mail to nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the subject line. */ package org.htmlcleaner; /** * @author patmoore * */ public enum BelongsTo { HEAD_AND_BODY("all"), HEAD("head"), BODY("body"); private final String dbCode; private BelongsTo(String dbCode) { this.dbCode =dbCode; } /** * @return the dbCode */ public String getDbCode() { return dbCode; } public static BelongsTo toValue(Object value) { BelongsTo result = null; if ( value instanceof BelongsTo) { result = (BelongsTo) value; } else if ( value != null ) { String dbCode = value.toString().trim(); for(BelongsTo belongsTo: BelongsTo.values()) { if ( belongsTo.getDbCode().equalsIgnoreCase(dbCode) || belongsTo.name().equalsIgnoreCase(dbCode)) { result = belongsTo; break; } } } return result; } } libhtmlcleaner-java-2.24/src/main/java/org/htmlcleaner/BrowserCompactXmlSerializer.java000066400000000000000000000164431365227134200314130ustar00rootroot00000000000000/* Copyright (c) 2006-2007, Vladimir Nikic All rights reserved. 
Redistribution and use of this software in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * The name of HtmlCleaner may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You can contact Vladimir Nikic by sending e-mail to nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the subject line. */ package org.htmlcleaner; import java.io.IOException; import java.io.Writer; import java.util.ArrayList; import java.util.List; import java.util.ListIterator; import java.util.StringTokenizer; /** *

* Browser compact XML serializer - creates resulting XML by stripping whitespaces wherever possible, * but preserving single whitespace where at least one exists. This behaviour is well suited * for web-browsers, which usually treat multiple whitespaces as single one, but make difference * between single whitespace and empty text. *

*/ public class BrowserCompactXmlSerializer extends XmlSerializer { private static final String PRE_TAG = "pre"; private static final String BR_TAG = "
"; private static final String LINE_BREAK = "\n"; public BrowserCompactXmlSerializer(CleanerProperties props) { super(props); } @Override protected void serialize(TagNode tagNode, Writer writer) throws IOException { serializeOpenTag(tagNode, writer, false); TagInfo tagInfo = props.getTagInfoProvider().getTagInfo(tagNode.getName()); String tagName = tagInfo!=null? tagInfo.getName() : null; List tagChildren = new ArrayList(tagNode.getAllChildren()); if (!isMinimizedTagSyntax(tagNode)) { ListIterator childrenIt = tagChildren.listIterator(); while (childrenIt.hasNext()) { Object item = childrenIt.next(); if (item != null) { if (item instanceof ContentNode && !PRE_TAG.equals(tagName)) { String content = ((ContentNode) item).getContent(); content = dontEscape(tagNode) ? content.replaceAll("]]>", "]]>") : escapeXml(content); content = content.replaceAll("^"+SpecialEntities.NON_BREAKABLE_SPACE+"+", " "); content = content.replaceAll(SpecialEntities.NON_BREAKABLE_SPACE+"+$", " "); boolean whitespaceAllowed = tagInfo != null && tagInfo.getDisplay().isLeadingAndEndWhitespacesAllowed(); boolean writeLeadingSpace = content.length() > 0 && (Character.isWhitespace(content.charAt(0))); boolean writeEndingSpace = content.length() > 1 && Character.isWhitespace(content.charAt(content.length() - 1)); content = content.trim(); if (content.length() != 0) { boolean hasPrevContent = false; int order = tagChildren.indexOf(item); if (order >= 2) { Object prev = tagChildren.get(order-1); hasPrevContent = isContentOrInline(prev); } if (writeLeadingSpace && (whitespaceAllowed || hasPrevContent)) { writer.write(' '); } StringTokenizer tokenizer = new StringTokenizer(content, LINE_BREAK, true); String prevToken = ""; while (tokenizer.hasMoreTokens()) { String token = tokenizer.nextToken(); if (prevToken.equals(token) && prevToken.equals(LINE_BREAK)) { writer.write(BR_TAG); prevToken = ""; } else if (LINE_BREAK.equals(token)) { writer.write(' '); } else { writer.write(token.trim()); } prevToken = 
token; } boolean hasFollowingContent = false; if (childrenIt.hasNext()) { Object next = childrenIt.next(); hasFollowingContent = isContentOrInline(next); childrenIt.previous(); } if (writeEndingSpace && (whitespaceAllowed || hasFollowingContent)) { writer.write(' '); } } else{ childrenIt.remove(); } } else if(item instanceof ContentNode){ String content = ((ContentNode) item).getContent(); writer.write(content); } else if (item instanceof CommentNode) { String content = ((CommentNode) item).getCommentedContent().trim(); writer.write(content); } else { ((BaseToken)item).serialize(this, writer); } } } serializeEndTag(tagNode, writer, tagInfo != null && tagInfo.getDisplay().isAfterTagLineBreakNeeded()); } } private boolean isContentOrInline(Object node) { boolean result = false; if (node instanceof ContentNode) { result = true; } else if (node instanceof TagNode) { TagInfo nextInfo = props.getTagInfoProvider().getTagInfo(((TagNode) node).getName()); result = nextInfo != null && nextInfo.getDisplay() == Display.inline; } return result; } }libhtmlcleaner-java-2.24/src/main/java/org/htmlcleaner/CData.java000066400000000000000000000050761365227134200247220ustar00rootroot00000000000000/* Copyright (c) 2006-2013, the HtmlCleaner Project All rights reserved. Redistribution and use of this software in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * The name of HtmlCleaner may not be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ package org.htmlcleaner; public class CData extends ContentNode implements HtmlNode { public static final String BEGIN_CDATA = ""; public static final String SAFE_BEGIN_CDATA = "/**/"; public static final String SAFE_BEGIN_CDATA_ALT = "//"; public CData(String content){ super(content); } public String getContentWithoutStartAndEndTokens(){ return this.content; } /* (non-Javadoc) * @see org.htmlcleaner.ContentNode#getContent() */ @Override public String getContent() { return getContentWithoutStartAndEndTokens(); } /* (non-Javadoc) * @see org.htmlcleaner.ContentNode#toString() */ @Override public String toString() { return getContentWithStartAndEndTokens(); } public String getContentWithStartAndEndTokens(){ return SAFE_BEGIN_CDATA + this.content + SAFE_END_CDATA; } } libhtmlcleaner-java-2.24/src/main/java/org/htmlcleaner/CleanTimeValues.java000066400000000000000000000061341365227134200267630ustar00rootroot00000000000000/* Copyright (c) 2006-2013, HtmlCleaner Team (Vladimir Nikic, Pat Moore, Scott Wilson) All rights reserved. 
Redistribution and use of this software in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * The name of HtmlCleaner may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ package org.htmlcleaner; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashSet; import java.util.Set; import java.util.Stack; import java.util.TreeSet; import org.htmlcleaner.HtmlCleaner.NestingState; import org.htmlcleaner.conditional.ITagNodeCondition; /** * This class is for thread-safe handling of private instance variables from HtmlCleaner */ class CleanTimeValues { boolean _headOpened = false; boolean _bodyOpened = false; @SuppressWarnings("rawtypes") Set _headTags = new LinkedHashSet(); @SuppressWarnings("rawtypes") Set allTags = new TreeSet(); transient Stack nestingStates = new Stack(); TagNode htmlNode; TagNode bodyNode; TagNode headNode; TagNode rootNode; Set pruneTagSet = new HashSet(); Set pruneNodeSet = new HashSet(); Set allowTagSet; /** * A stack of namespaces for currently open tags. Every xmlns declaration * on a tag adds another namespace to the stack, which is removed when the * tag is closed. In this way you can keep track of what namespace a tag * belongs to. */ transient Stack namespace = new Stack(); /** * A map of all the namespace prefixes and URIs declared within the document. * We use this to check whether any prefixes remain undeclared. */ transient HashMap namespaceMap = new HashMap(); }libhtmlcleaner-java-2.24/src/main/java/org/htmlcleaner/CleanerProperties.java000066400000000000000000000526761365227134200274040ustar00rootroot00000000000000/* Copyright (c) 2006-2007, Vladimir Nikic All rights reserved. Redistribution and use of this software in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
* The name of HtmlCleaner may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You can contact Vladimir Nikic by sending e-mail to nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the subject line. 
*/ package org.htmlcleaner; import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.StringTokenizer; import org.htmlcleaner.audit.ErrorType; import org.htmlcleaner.audit.HtmlModificationListener; import org.htmlcleaner.conditional.ITagNodeCondition; import org.htmlcleaner.conditional.TagNodeAutoGeneratedCondition; import org.htmlcleaner.conditional.TagNodeNameCondition; /** * Properties defining cleaner's behaviour */ public class CleanerProperties implements HtmlModificationListener{ // Force consistent cross-platform encoding ( mandatory for reliable server operation) public static final String DEFAULT_CHARSET = "UTF-8"; public static final String BOOL_ATT_SELF = "self"; public static final String BOOL_ATT_EMPTY = "empty"; public static final String BOOL_ATT_TRUE = "true"; private ITagInfoProvider tagInfoProvider; /** * If this parameter is set to true, ampersand sign (&) that proceeds valid XML character sequences (&XXX;) will not be escaped with &XXX; */ private boolean advancedXmlEscape; private String useCdataFor; private List useCdataForList; private boolean translateSpecialEntities; private boolean recognizeUnicodeChars; private boolean omitUnknownTags; private boolean treatUnknownTagsAsContent; private boolean omitDeprecatedTags; private boolean omitComments; private boolean treatDeprecatedTagsAsContent; private OptionalOutput omitXmlDeclaration; private OptionalOutput omitDoctypeDeclaration; private OptionalOutput omitHtmlEnvelope; private boolean useEmptyElementTags; private boolean allowMultiWordAttributes; private String booleanAttributeValues; private boolean ignoreQuestAndExclam; private boolean allowHtmlInsideAttributes; private boolean namespacesAware; private boolean transSpecialEntitiesToNCR; private boolean omitCdataOutsideScriptAndStyle; private boolean deserializeEntities; private boolean trimAttributeValues; private int htmlVersion; private boolean 
allowInvalidAttributeNames; private String invalidAttributeNamePrefix; /** * "cause the cleaner cannot keep track of whitespace at that level", * there are 2 lists built: one for the head , one for the body. So whitespace that falls outside of the head and body is not preserved * this creates at least a newline break. * * More work than really wanted at this point to "preserve" the whitespace. */ private boolean addNewlineToHeadAndBody; /** * Tries to keep inside head all whitespace and comments that were originally there */ private boolean keepWhitespaceAndCommentsInHead; private String hyphenReplacementInComment; // comma separate list of tags pruned. private String pruneTags; // comma separate list of tags allowed. private String allowTags; private CleanerTransformations cleanerTransformations = new CleanerTransformations(); private List < HtmlModificationListener > htmlModificationListeners; /** * blacklist of tags */ private Set pruneTagSet = new HashSet(); /** * the list of allowed tags (whitelist approach v. 
blacklist approach of pruneTags ) */ private Set allowTagSet = new HashSet(); private String charset = DEFAULT_CHARSET; private boolean transResCharsToNCR; public CleanerProperties() { reset(); } /** * @param tagInfoProvider */ public CleanerProperties(ITagInfoProvider tagInfoProvider) { reset(); this.tagInfoProvider = tagInfoProvider; } /** * @param tagInfoProvider the tagInfoProvider to set */ void setTagInfoProvider(ITagInfoProvider tagInfoProvider) { this.tagInfoProvider = tagInfoProvider; } public ITagInfoProvider getTagInfoProvider() { return tagInfoProvider; } public boolean isAdvancedXmlEscape() { return advancedXmlEscape; } public void setAdvancedXmlEscape(boolean advancedXmlEscape) { this.advancedXmlEscape = advancedXmlEscape; } public boolean isTransResCharsToNCR() { return transResCharsToNCR; } public void setTransResCharsToNCR(boolean transResCharsToNCR) { this.transResCharsToNCR = transResCharsToNCR; } public boolean isUseCdataForScriptAndStyle() { return isUseCdataFor("script") && isUseCdataFor("style"); } public void setUseCdataForScriptAndStyle(boolean useCdataForScriptAndStyle) { if (useCdataForScriptAndStyle) setUseCdataFor("script,style"); else setUseCdataFor(""); } public void setUseCdataFor(String useCdataFor) { if (useCdataFor != null) { this.useCdataFor = useCdataFor; this.useCdataForList = Arrays.asList(useCdataFor.toLowerCase().split(",")); } else { this.useCdataFor = ""; this.useCdataForList = null; } } public String getUseCdataFor() { return this.useCdataFor; } public boolean isUseCdataFor(String useCdataFor) { if (useCdataForList != null && useCdataFor != null) return useCdataForList.contains(useCdataFor.toLowerCase()); else return false; } public boolean isTranslateSpecialEntities() { return translateSpecialEntities; } /** * TODO : use {@link OptionalOutput} * @param translateSpecialEntities */ public void setTranslateSpecialEntities(boolean translateSpecialEntities) { this.translateSpecialEntities = translateSpecialEntities; } public 
boolean isRecognizeUnicodeChars() {
        return recognizeUnicodeChars;
    }

    /** Stores the {@code recognizeUnicodeChars} setting. */
    public void setRecognizeUnicodeChars(boolean recognizeUnicodeChars) {
        this.recognizeUnicodeChars = recognizeUnicodeChars;
    }

    /** @return the {@code omitUnknownTags} setting */
    public boolean isOmitUnknownTags() {
        return omitUnknownTags;
    }

    /** Stores the {@code omitUnknownTags} setting. */
    public void setOmitUnknownTags(boolean omitUnknownTags) {
        this.omitUnknownTags = omitUnknownTags;
    }

    /** @return the {@code treatUnknownTagsAsContent} setting */
    public boolean isTreatUnknownTagsAsContent() {
        return treatUnknownTagsAsContent;
    }

    /** Stores the {@code treatUnknownTagsAsContent} setting. */
    public void setTreatUnknownTagsAsContent(boolean treatUnknownTagsAsContent) {
        this.treatUnknownTagsAsContent = treatUnknownTagsAsContent;
    }

    /** @return the {@code omitDeprecatedTags} setting */
    public boolean isOmitDeprecatedTags() {
        return omitDeprecatedTags;
    }

    /** Stores the {@code omitDeprecatedTags} setting. */
    public void setOmitDeprecatedTags(boolean omitDeprecatedTags) {
        this.omitDeprecatedTags = omitDeprecatedTags;
    }

    /** @return the {@code treatDeprecatedTagsAsContent} setting */
    public boolean isTreatDeprecatedTagsAsContent() {
        return treatDeprecatedTagsAsContent;
    }

    /** Stores the {@code treatDeprecatedTagsAsContent} setting. */
    public void setTreatDeprecatedTagsAsContent(boolean treatDeprecatedTagsAsContent) {
        this.treatDeprecatedTagsAsContent = treatDeprecatedTagsAsContent;
    }

    /** @return the {@code omitComments} setting */
    public boolean isOmitComments() {
        return omitComments;
    }

    /** Stores the {@code omitComments} setting. */
    public void setOmitComments(boolean omitComments) {
        this.omitComments = omitComments;
    }

    /** @return true when the XML declaration setting equals {@code OptionalOutput.omit} */
    public boolean isOmitXmlDeclaration() {
        return omitXmlDeclaration == OptionalOutput.omit;
    }

    /**
     * Maps the flag onto the stored {@code OptionalOutput} value.
     *
     * @param omitXmlDeclaration true stores {@code OptionalOutput.omit},
     *        false stores {@code OptionalOutput.alwaysOutput}
     */
    public void setOmitXmlDeclaration(boolean omitXmlDeclaration) {
        if (omitXmlDeclaration) {
            this.omitXmlDeclaration = OptionalOutput.omit;
        } else {
            this.omitXmlDeclaration = OptionalOutput.alwaysOutput;
        }
    }

    /**
     * @return true when the doctype setting equals {@code OptionalOutput.omit},
     *         and also whenever {@link #isOmitHtmlEnvelope()} is true
     */
    public boolean isOmitDoctypeDeclaration() {
        if (omitDoctypeDeclaration == OptionalOutput.omit) {
            return true;
        }
        return isOmitHtmlEnvelope();
    }

    /**
     * Maps the flag onto the stored {@code OptionalOutput} value.
     *
     * @param omitDoctypeDeclaration true stores {@code OptionalOutput.omit},
     *        false stores {@code OptionalOutput.alwaysOutput}
     */
    public void setOmitDoctypeDeclaration(boolean omitDoctypeDeclaration) {
        if (omitDoctypeDeclaration) {
            this.omitDoctypeDeclaration = OptionalOutput.omit;
        } else {
            this.omitDoctypeDeclaration = OptionalOutput.alwaysOutput;
        }
    }

    /** @return true when the html envelope setting equals {@code OptionalOutput.omit} */
    public boolean isOmitHtmlEnvelope() {
        return omitHtmlEnvelope == OptionalOutput.omit;
    }

    /**
     * Maps the flag onto the stored {@code OptionalOutput} value.
     *
     * @param omitHtmlEnvelope true stores {@code OptionalOutput.omit},
     *        false stores {@code OptionalOutput.alwaysOutput}
     */
    public void setOmitHtmlEnvelope(boolean omitHtmlEnvelope) {
        if (omitHtmlEnvelope) {
            this.omitHtmlEnvelope = OptionalOutput.omit;
        } else {
            this.omitHtmlEnvelope = OptionalOutput.alwaysOutput;
        }
    }

    /** @return the {@code useEmptyElementTags} setting */
    public boolean isUseEmptyElementTags() {
        return useEmptyElementTags;
    }

    /** Stores the {@code useEmptyElementTags} setting. */
    public void setUseEmptyElementTags(boolean useEmptyElementTags) {
        this.useEmptyElementTags = useEmptyElementTags;
    }

    /** @return the {@code allowMultiWordAttributes} setting */
    public boolean isAllowMultiWordAttributes() {
        return allowMultiWordAttributes;
    }

    /** Stores the {@code allowMultiWordAttributes} setting. */
    public void setAllowMultiWordAttributes(boolean allowMultiWordAttributes) {
        this.allowMultiWordAttributes = allowMultiWordAttributes;
    }

    /** @return the {@code allowHtmlInsideAttributes} setting */
    public boolean isAllowHtmlInsideAttributes() {
        return allowHtmlInsideAttributes;
    }

    /** Stores the {@code allowHtmlInsideAttributes} setting. */
    public void setAllowHtmlInsideAttributes(boolean allowHtmlInsideAttributes) {
        this.allowHtmlInsideAttributes = allowHtmlInsideAttributes;
    }

    /** @return the {@code ignoreQuestAndExclam} setting */
    public boolean isIgnoreQuestAndExclam() {
        return ignoreQuestAndExclam;
    }

    /** Stores the {@code ignoreQuestAndExclam} setting. */
    public void setIgnoreQuestAndExclam(boolean ignoreQuestAndExclam) {
        this.ignoreQuestAndExclam = ignoreQuestAndExclam;
    }

    /** @return the {@code namespacesAware} setting */
    public boolean isNamespacesAware() {
        return namespacesAware;
    }

    /** Stores the {@code namespacesAware} setting. */
    public void setNamespacesAware(boolean namespacesAware) {
        this.namespacesAware = namespacesAware;
    }

    /** @return the {@code addNewlineToHeadAndBody} setting */
    public boolean isAddNewlineToHeadAndBody() {
        return addNewlineToHeadAndBody;
    }

    /** Stores the {@code addNewlineToHeadAndBody} setting. */
    public void setAddNewlineToHeadAndBody(boolean addNewlineToHeadAndBody) {
        this.addNewlineToHeadAndBody = addNewlineToHeadAndBody;
    }

    /** @return the {@code keepWhitespaceAndCommentsInHead} setting */
    public boolean isKeepWhitespaceAndCommentsInHead() {
        return keepWhitespaceAndCommentsInHead;
    }

    /** Stores the given flag in {@code keepWhitespaceAndCommentsInHead}. */
    public void setKeepWhitespaceAndCommentsInHead(boolean keepHeadWhitespace) {
        this.keepWhitespaceAndCommentsInHead = keepHeadWhitespace;
    }

    /** @return the {@code hyphenReplacementInComment} string */
    public String getHyphenReplacementInComment() {
        return hyphenReplacementInComment;
    }

    /** Stores the {@code hyphenReplacementInComment} string. */
    public void setHyphenReplacementInComment(String hyphenReplacementInComment) {
        this.hyphenReplacementInComment = hyphenReplacementInComment;
    }

    /** @return the string last passed to {@link #setPruneTags(String)} */
    public String getPruneTags() {
        return pruneTags;
    }

    /** @return the {@code omitCdataOutsideScriptAndStyle} setting */
    public boolean isOmitCdataOutsideScriptAndStyle(){
        return omitCdataOutsideScriptAndStyle;
    }

    /** Stores the {@code omitCdataOutsideScriptAndStyle} setting. */
    public void setOmitCdataOutsideScriptAndStyle(boolean value){
        omitCdataOutsideScriptAndStyle = value;
    }

    /** @return the {@code deserializeEntities} setting */
    public boolean isDeserializeEntities() {
        return deserializeEntities;
    }

    /** Stores the {@code deserializeEntities} setting. */
    public void setDeserializeEntities(boolean deserializeEntities) {
        this.deserializeEntities = deserializeEntities;
    }

    /**
     * Records the html version and switches the tag info provider to match:
     * {@code Html4TagProvider.INSTANCE} for 4, {@code Html5TagProvider.INSTANCE}
     * for every other value.
     *
     * @param version Number 4 for html4 or 5 for html5
     */
    public void setHtmlVersion(int version){
        this.htmlVersion = version;
        if (version == 4) {
            this.setTagInfoProvider(Html4TagProvider.INSTANCE);
        } else {
            this.setTagInfoProvider(Html5TagProvider.INSTANCE);
        }
    }

    /**
     * @return int The html version last stored by {@link #setHtmlVersion(int)}
     */
    public int getHtmlVersion (){
        return this.htmlVersion;
    }

    /** @return the {@code trimAttributeValues} setting */
    public boolean isTrimAttributeValues() {
        return trimAttributeValues;
    }

    /** Stores the {@code trimAttributeValues} setting. */
    public void setTrimAttributeValues(boolean trimAttributeValues) {
        this.trimAttributeValues = trimAttributeValues;
    }

    /**
     * Remembers the given string, resets the prune tag set and then adds one
     * tag name condition per tag listed in the string.
     *
     * @param pruneTags comma-separated tag names; may be null
     */
    public void setPruneTags(String pruneTags) {
        this.pruneTags = pruneTags;
        this.resetPruneTagSet();
        this.addTagNameConditions(this.pruneTagSet, pruneTags);
    }

    /**
     * Adds the condition to existing prune tag set.
* * @param condition */ public void addPruneTagNodeCondition(ITagNodeCondition condition){ pruneTagSet.add(condition); } public Set getPruneTagSet() { return pruneTagSet; } public String getAllowTags() { return allowTags; } public void setAllowTags(String allowTags) { this.allowTags = allowTags; this.setAllowTagSet(allowTags); } private void setAllowTagSet(String allowTags) { allowTagSet.clear(); addTagNameConditions(allowTagSet, allowTags); } public boolean isTransSpecialEntitiesToNCR() { return transSpecialEntitiesToNCR; } public void setTransSpecialEntitiesToNCR(boolean transSpecialEntitiesToNCR) { this.transSpecialEntitiesToNCR = transSpecialEntitiesToNCR; } /** * @param tagSet * @param tagsNameStr */ private void addTagNameConditions(Set tagSet, String tagsNameStr) { if (tagsNameStr != null) { StringTokenizer tokenizer = new StringTokenizer(tagsNameStr, ","); while ( tokenizer.hasMoreTokens() ) { tagSet.add( new TagNodeNameCondition(tokenizer.nextToken().trim().toLowerCase()) ); } } } public Set getAllowTagSet() { return allowTagSet; } /** * @param charset the charset to set */ public void setCharset(String charset) { this.charset = charset; } /** * @return the charset */ public String getCharset() { return charset; } public String getBooleanAttributeValues() { return booleanAttributeValues; } public void setBooleanAttributeValues(String booleanAttributeValues) { if ( BOOL_ATT_SELF.equalsIgnoreCase(booleanAttributeValues) || BOOL_ATT_EMPTY.equalsIgnoreCase(booleanAttributeValues) || BOOL_ATT_TRUE.equalsIgnoreCase(booleanAttributeValues) ) { this.booleanAttributeValues = booleanAttributeValues.toLowerCase(); } else { this.booleanAttributeValues = BOOL_ATT_SELF; } } /** * advancedXmlEscape = true; * setUseCdataFor("script,style"); * translateSpecialEntities = true; * recognizeUnicodeChars = true; * omitUnknownTags = false; * treatUnknownTagsAsContent = false; * omitDeprecatedTags = false; * treatDeprecatedTagsAsContent = false; * omitComments = false; * 
* omitXmlDeclaration = OptionalOutput.alwaysOutput;
     * omitDoctypeDeclaration = OptionalOutput.alwaysOutput;
     * omitHtmlEnvelope = OptionalOutput.alwaysOutput;
     * useEmptyElementTags = true;
     * allowMultiWordAttributes = true;
     * allowHtmlInsideAttributes = false;
     * ignoreQuestAndExclam = true;
     * namespacesAware = true;
     * addNewlineToHeadAndBody = true;
     * keepWhitespaceAndCommentsInHead = true;
     * hyphenReplacementInComment = "=";
     * pruneTags = null (the prune tag set then holds only TagNodeAutoGeneratedCondition.INSTANCE);
     * allowTags = null (the allow tag set is emptied);
     * booleanAttributeValues = BOOL_ATT_SELF;
     * charset = "UTF-8";
     * cleanerTransformations.clear();
     * tagInfoProvider = Html4TagProvider.INSTANCE when getHtmlVersion() equals
     *     HtmlCleaner.HTML_4, otherwise Html5TagProvider.INSTANCE;
     * htmlModificationListeners = new empty list;
     * omitCdataOutsideScriptAndStyle = false;
     * trimAttributeValues = true;
     * invalidAttributeNamePrefix = "";
     * allowInvalidAttributeNames = false;
     *
     * NOTE(review): an earlier revision of this list also named
     * "collapseNullHtml = CollapseHtml.none"; this method does not assign it.
     */
    public void reset() {
        advancedXmlEscape = true;
        setUseCdataFor("script,style");
        translateSpecialEntities = true;
        recognizeUnicodeChars = true;
        omitUnknownTags = false;
        treatUnknownTagsAsContent = false;
        omitDeprecatedTags = false;
        treatDeprecatedTagsAsContent = false;
        omitComments = false;
        omitXmlDeclaration = OptionalOutput.alwaysOutput;
        omitDoctypeDeclaration = OptionalOutput.alwaysOutput;
        omitHtmlEnvelope = OptionalOutput.alwaysOutput;
        useEmptyElementTags = true;
        allowMultiWordAttributes = true;
        allowHtmlInsideAttributes = false;
        ignoreQuestAndExclam = true;
        namespacesAware = true;
        addNewlineToHeadAndBody = true;
        keepWhitespaceAndCommentsInHead = true;
        hyphenReplacementInComment = "=";
        // The setters are used (rather than plain assignment) so the derived
        // prune/allow tag sets are rebuilt along with the strings.
        setPruneTags(null);
        setAllowTags(null);
        booleanAttributeValues = BOOL_ATT_SELF;
        charset = "UTF-8";
        cleanerTransformations.clear();
        // setPruneTags(null) above has already reset the prune tag set; this repeats it.
        resetPruneTagSet();
        // The html version itself is not reset; only the provider is re-derived from it.
        if (this.getHtmlVersion()==HtmlCleaner.HTML_4){
            tagInfoProvider = Html4TagProvider.INSTANCE;
        }
        else{
            tagInfoProvider = Html5TagProvider.INSTANCE;
        }
        // Previously registered listeners are dropped by replacing the list.
        htmlModificationListeners = new ArrayList < HtmlModificationListener >();
        omitCdataOutsideScriptAndStyle = false;
        trimAttributeValues = true;
        invalidAttributeNamePrefix = "";
        allowInvalidAttributeNames = false;
    }

    /**
     * Empties the prune tag set and re-adds TagNodeAutoGeneratedCondition.INSTANCE,
     * which is therefore present in the set after every reset.
     */
    private void resetPruneTagSet() {
        pruneTagSet.clear();
        pruneTagSet.add(TagNodeAutoGeneratedCondition.INSTANCE);
    }

    /**
     * @return the cleanerTransformations
*/ public CleanerTransformations getCleanerTransformations() { return cleanerTransformations; } public void setCleanerTransformations(CleanerTransformations cleanerTransformations) { if ( cleanerTransformations == null ) { this.cleanerTransformations.clear(); } else { this.cleanerTransformations = cleanerTransformations; } } /** * Adds a listener to the list of objects that will be notified about changes that * cleaner does during cleanup process. * * @param listener -- listener object to be notified of the changes. */ public void addHtmlModificationListener(HtmlModificationListener listener){ htmlModificationListeners.add(listener); } public void fireConditionModification(ITagNodeCondition condition, TagNode tagNode) { for (HtmlModificationListener listener : htmlModificationListeners) { listener.fireConditionModification(condition, tagNode); } } public void fireHtmlError(boolean certainty, TagNode startTagToken, ErrorType type) { for (HtmlModificationListener listener : htmlModificationListeners) { listener.fireHtmlError(certainty, startTagToken, type); } } public void fireUglyHtml(boolean certainty, TagNode startTagToken, ErrorType errorType) { for (HtmlModificationListener listener : htmlModificationListeners) { listener.fireUglyHtml(certainty, startTagToken, errorType); } } public void fireUserDefinedModification(boolean certainty, TagNode tagNode, ErrorType errorType) { for (HtmlModificationListener listener : htmlModificationListeners) { listener.fireUserDefinedModification(certainty, tagNode, errorType); } } /** * Get the prefix to use to try to make valid attribute names * @return */ public String getInvalidXmlAttributeNamePrefix() { return invalidAttributeNamePrefix; } /** * Sets the prefix to use for xml attributes that are invalid * @param invalidXmlAttributePrefix */ public void setInvalidXmlAttributeNamePrefix( String invalidXmlAttributePrefix) { this.invalidAttributeNamePrefix = invalidXmlAttributePrefix; } /** * Set whether to allow invalid 
attribute names, or to try to fix or omit them * @param allowInvalidAttributeNames */ public void setAllowInvalidAttributeNames( boolean allowInvalidAttributeNames) { this.allowInvalidAttributeNames = allowInvalidAttributeNames; } /** * If false, when outputting XML, if an attribute name is not valid, attempt to * fix it by using a prefix and removing invalid characters. Otherwise, omit invalid attributes * @return */ public boolean isAllowInvalidAttributeNames() { return allowInvalidAttributeNames; } } libhtmlcleaner-java-2.24/src/main/java/org/htmlcleaner/CleanerTransformations.java000066400000000000000000000132441365227134200304250ustar00rootroot00000000000000/* Copyright (c) 2006-2007, Vladimir Nikic All rights reserved. Redistribution and use of this software in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * The name of HtmlCleaner may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You can contact Vladimir Nikic by sending e-mail to nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the subject line. */ package org.htmlcleaner; import java.util.HashMap; import java.util.Iterator; import java.util.Map; /** * Contains transformation collection. */ public class CleanerTransformations { private Map mappings = new HashMap(); private TagTransformation globalTransformations=new TagTransformation(); public CleanerTransformations() { } /** * @param transInfos */ public CleanerTransformations(Map transInfos) { updateTagTransformations(transInfos); } /** * Adds specified tag transformation to the collection. * @param tagTransformation */ public void addTransformation(TagTransformation tagTransformation) { if (tagTransformation != null) { mappings.put( tagTransformation.getSourceTag(), tagTransformation ); } } public void addGlobalTransformation(AttributeTransformation attributeTransformation) { globalTransformations.addAttributePatternTransformation(attributeTransformation); } public boolean hasTransformationForTag(String tagName) { return tagName != null && mappings.containsKey(tagName.toLowerCase()); } public TagTransformation getTransformation(String tagName) { return tagName != null ? 
(TagTransformation) mappings.get(tagName.toLowerCase()) : null;
    }

    /**
     * Applies one textual transformation description.
     * <ul>
     * <li>A key without a dot (or starting with one) declares a tag
     *     transformation: the key is the source tag and the value is
     *     {@code destname[,preserveatts]}, split on ',' or ';'. Attributes are
     *     preserved unless the second token is given and is not
     *     "true"/"yes" (case-insensitive) or "1".</li>
     * <li>A key of the form {@code tagname.attname} adds an attribute
     *     transformation to the transformation already registered for
     *     {@code tagname}; it is ignored when no such transformation exists or
     *     when the attribute name is missing.</li>
     * </ul>
     *
     * @param key transformation key; null is ignored
     * @param value transformation value; may be null
     */
    public void updateTagTransformations(String key, String value) {
        if (key == null) {
            // nothing can be registered without a tag name
            return;
        }

        int index = key.indexOf('.');

        if (index <= 0) {
            // new tag transformation case (tagname[=destname[,preserveatts]])
            String destTag = null;
            boolean preserveSourceAtts = true;
            if (value != null) {
                String[] tokens = Utils.tokenize(value, ",;");
                if (tokens.length > 0) {
                    destTag = tokens[0];
                }
                if (tokens.length > 1) {
                    preserveSourceAtts = "true".equalsIgnoreCase(tokens[1])
                            || "yes".equalsIgnoreCase(tokens[1])
                            || "1".equals(tokens[1]);
                }
            }
            addTransformation(new TagTransformation(key, destTag, preserveSourceAtts));
            return;
        }

        // attribute transformation description (tagname.attname)
        String[] parts = Utils.tokenize(key, ".");
        if (parts.length < 2) {
            // A key such as "tag." carries no attribute name. Reading parts[1]
            // would throw if the tokenizer drops empty tokens, so skip it.
            return;
        }
        TagTransformation trans = getTransformation(parts[0]);
        if (trans != null) {
            trans.addAttributeTransformation(parts[1], value);
        }
    }

    /**
     * Applies {@link #updateTagTransformations(String, String)} to every entry
     * of the map, in the map's iteration order.
     *
     * @param transInfos map with String keys and String values; null is ignored
     */
    public void updateTagTransformations(Map transInfos) {
        if (transInfos == null) {
            return;
        }
        for (Object item : transInfos.entrySet()) {
            Map.Entry entry = (Map.Entry) item;
            updateTagTransformations((String) entry.getKey(), (String) entry.getValue());
        }
    }

    /**
     * Runs the attributes through the transformation registered for the tag
     * (when there is one) and then through the global transformation.
     *
     * @param originalTagName tag whose transformation should be applied
     * @param attributes attributes to transform
     * @return the result of the global transformation step
     */
    public Map transformAttributes(String originalTagName, Map attributes) {
        TagTransformation tagTrans = getTransformation(originalTagName);
        Map results;
        if (tagTrans != null) {
            results = tagTrans.applyTagTransformations(attributes);
        } else {
            results = attributes;
        }
        return this.globalTransformations.applyTagTransformations(results);
    }

    /**
     * @param tagName original tag name
     * @return the destination tag of the registered transformation, or
     *         {@code tagName} itself when no transformation is registered
     */
    public String getTagName(String tagName) {
        // A single lookup suffices: addTransformation never stores null values.
        TagTransformation tagTransformation = getTransformation(tagName);
        return tagTransformation != null ? tagTransformation.getDestTag() : tagName;
    }

    /**
     * Removes all per-tag transformations.
     * NOTE(review): the global transformation is not reset here - confirm this is intended.
     */
    public void clear() {
        this.mappings.clear();
    }
}libhtmlcleaner-java-2.24/src/main/java/org/htmlcleaner/CloseTag.java000066400000000000000000000053701365227134200254440ustar00rootroot00000000000000/* All rights reserved. Redistribution and use of this software in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * The name of HtmlCleaner may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You can contact Vladimir Nikic by sending e-mail to nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the subject line. Additional work by Amplafi. -- All rights released. */ package org.htmlcleaner; /** * @author patmoore * */ public enum CloseTag { /** *
* An explicit end tag is required. Minimizing to the minimized tag form
     * is not permitted.
     */
    required(false, true),
    /**
     * Either an explicit end tag or the minimized tag form
is permitted */ optional(true, true), /** * is not permitted */ forbidden(true, false); private final boolean minimizedTagPermitted; private final boolean endTagPermitted; /** * * @param minimizedTagPermitted if true tag can be reduced to * @param endTagPermitted TODO */ private CloseTag(boolean minimizedTagPermitted, boolean endTagPermitted) { this.minimizedTagPermitted = minimizedTagPermitted; this.endTagPermitted =endTagPermitted; } /** * @return true if form is allowed */ public boolean isMinimizedTagPermitted() { return this.minimizedTagPermitted; } /** * @return true if or is permitted. */ public boolean isEndTagPermitted() { return endTagPermitted; } } libhtmlcleaner-java-2.24/src/main/java/org/htmlcleaner/CommandLine.java000066400000000000000000000404421365227134200261300ustar00rootroot00000000000000/* Copyright (c) 2006-2007, Vladimir Nikic All rights reserved. Redistribution and use of this software in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * The name of HtmlCleaner may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You can contact Vladimir Nikic by sending e-mail to nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the subject line. */ package org.htmlcleaner; import java.io.File; import java.io.IOException; import java.io.OutputStream; import java.io.FileOutputStream; import java.net.URL; import java.util.Map; import java.util.Scanner; import java.util.TreeMap; import java.util.logging.Logger; import org.htmlcleaner.audit.HtmlModificationListenerLogger; /** *

Command line usage class.

*/
public class CommandLine {

    private static final String OMITXMLDECL = "omitxmldecl";

    /**
     * Reads a boolean switch from the command line.
     * If the specified argument name exists without a value, return true.
     * If it exists with a value, translate it as a boolean.
     * Arguments whose name does not match are ignored. Names are matched
     * case-insensitively by prefix, in the same way as {@code getArgValue};
     * when the switch occurs more than once the last occurrence wins.
     *
     * @param args the command line arguments
     * @param name the switch name
     * @return true, or false, depending on whether the switch has been specified
     */
    private static boolean getSwitchArgument(String[] args, String name){
        boolean value = false;
        String wanted = name.toLowerCase();
        for (String curr : args) {
            int eqIndex = curr.indexOf('=');
            if (eqIndex >= 0) {
                String argName = curr.substring(0, eqIndex).trim();
                if (argName.toLowerCase().startsWith(wanted)) {
                    value = toBoolean(curr.substring(eqIndex + 1).trim());
                }
            } else if (curr.trim().toLowerCase().startsWith(wanted)) {
                // Bare switch: only an argument naming THIS switch turns it on.
                // Previously any value-less argument did.
                value = true;
            }
        }
        return value;
    }

    /**
     * Returns the value of the first {@code name=value} argument whose name
     * starts with {@code name} (case-insensitive).
     *
     * @param args the command line arguments
     * @param name the argument name
     * @param defaultValue returned when no matching argument is present
     * @return the trimmed text after '=', or {@code defaultValue}
     */
    private static String getArgValue(String[] args, String name, String defaultValue) {
        for (String curr : args) {
            int eqIndex = curr.indexOf('=');
            if (eqIndex >= 0) {
                String argName = curr.substring(0, eqIndex).trim();
                String argValue = curr.substring(eqIndex+1).trim();

                if (argName.toLowerCase().startsWith(name.toLowerCase())) {
                    return argValue;
                }
            }
        }

        return defaultValue;
    }

    /**
     * @param s text to interpret; may be null
     * @return true only for "on", "true" or "yes" (case-insensitive)
     */
    private static boolean toBoolean(String s) {
        return s != null && ( "on".equalsIgnoreCase(s) || "true".equalsIgnoreCase(s) || "yes".equalsIgnoreCase(s) );
    }

    private final static String className = CommandLine.class.getName();
    private final static Logger logger = Logger.getLogger(className);

    public static void main(String[] args) throws IOException, XPatherException {
        String source = getArgValue(args, "src", "");
        Scanner scan = new Scanner(System.in);
        String s = "";
        if ( "".equals(source) ) {
            while (scan.hasNext()) {
                s += scan.nextLine();
            }
            if (s.compareTo("") != 0) {
                System.err.println("Output:");
            } else {
                System.err.println("Usage: java -jar htmlcleanerXX.jar src= [htmlver=4] [incharset=] " +
                    "[dest=] [outcharset=] [taginfofile=] [options...]");
                System.err.println("Alternative: java -jar htmlcleanerXX.jar (reads the input from 
console)"); System.err.println(""); System.err.println("where options include:"); System.err.println(" outputtype=simple* | compact | browser-compact | pretty"); System.err.println(" advancedxmlescape=true* | false"); System.err.println(" usecdata=true* | false"); System.err.println(" usecdatafor= [script,style]"); System.err.println(" specialentities=true* | false"); System.err.println(" unicodechars=true* | false"); System.err.println(" omitunknowntags=true | false*"); System.err.println(" treatunknowntagsascontent=true | false*"); System.err.println(" omitdeprtags=true | false*"); System.err.println(" treatdeprtagsascontent=true | false*"); System.err.println(" omitcomments=true | false*"); System.err.println(" " +OMITXMLDECL +"=true* | false"); System.err.println(" omitdoctypedecl=true* | false"); System.err.println(" omithtmlenvelope=true | false*"); System.err.println(" useemptyelementtags=true* | false"); System.err.println(" allowmultiwordattributes=true* | false"); System.err.println(" allowhtmlinsideattributes=true | false*"); System.err.println(" ignoreqe=true | false*"); System.err.println(" namespacesaware=true* | false"); System.err.println(" hyphenreplacement= [=]"); System.err.println(" prunetags= []"); System.err.println(" booleanatts=self* | empty | true"); System.err.println(" nodebyxpath="); System.err.println(" allowinvalidxmlattributenames=true | false*"); System.err.println(" invalidxmlattributenameprefix= []"); System.err.println(" t:[=[,]]"); System.err.println(" t:.[=