libhtmlcleaner-java-2.2/0000755000175000017500000000000012150221763015544 5ustar stappersstapperslibhtmlcleaner-java-2.2/default.xml0000644000175000017500000006035012032242274017715 0ustar stappersstappers a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font h1,h2,h3,h4,h5,h6,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font h1,h2,h3,h4,h5,h6,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font h1,h2,h3,h4,h5,h6,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font h1,h2,h3,h4,h5,h6,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font h1,h2,h3,h4,h5,h6,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font h1,h2,h3,h4,h5,h6,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font p,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml nobr a map area map a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml 
a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font li,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml dt,dd dt,dd a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml tr,tbody,thead,tfoot,colgroup,col,form,caption,tr a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font tr,thead,tbody,tfoot,caption,colgroup,table,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param table tbody td,th thead,tfoot tr,td,th,caption,colgroup table tr td,th,caption,colgroup table tr td,th,caption,colgroup table tr,form td,th,tr,tbody,thead,tfoot,caption,colgroup table tr,form td,th,tr,tbody,thead,tfoot,caption,colgroup table tr,form td,th,tr,tbody,thead,tfoot,caption,colgroup table table col td,th,tr,tbody,thead,tfoot,caption,colgroup table td,th,tr,tbody,thead,tfoot,caption,colgroup form a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font option,optgroup,textarea,select,fieldset,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml select,optgroup,option select,optgroup,option option,optgroup option,optgroup,select select option select option optgroup select,optgroup,option a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml fieldset legend a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml 
u,i,tt,sub,sup,big,small,strike,blink,s b,u,tt,sub,sup,big,small,strike,blink,s b,i,tt,sub,sup,big,small,strike,blink,s b,u,i,sub,sup,big,small,strike,blink,s b,u,i,tt,sup,big,small,strike,blink,s b,u,i,tt,sub,big,small,strike,blink,s b,u,i,tt,sub,sup,small,strike,blink,s b,u,i,tt,sub,sup,big,strike,blink,s b,u,i,tt,sub,sup,big,small,blink,s b,u,i,tt,sub,sup,big,small,strike,s a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml b,u,i,tt,sub,sup,big,small,strike,blink a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml libhtmlcleaner-java-2.2/build.xml0000644000175000017500000000531412150221763017370 0ustar stappersstappers HtmlCleaner libhtmlcleaner-java-2.2/config/0000755000175000017500000000000012032242274017010 5ustar stappersstapperslibhtmlcleaner-java-2.2/config/MANIFEST.MF0000644000175000017500000000022312032242274020437 0ustar stappersstappersManifest-Version: 1.0 Ant-Version: Apache Ant 1.6.5 Created-By: 1.5.0_08-b03 (Sun Microsystems Inc.) 
Main-Class: org.htmlcleaner.CommandLine libhtmlcleaner-java-2.2/src/0000755000175000017500000000000012032242274016332 5ustar stappersstapperslibhtmlcleaner-java-2.2/src/main/0000755000175000017500000000000012032242274017256 5ustar stappersstapperslibhtmlcleaner-java-2.2/src/main/java/0000755000175000017500000000000012032242274020177 5ustar stappersstapperslibhtmlcleaner-java-2.2/src/main/java/org/0000755000175000017500000000000012032242274020766 5ustar stappersstapperslibhtmlcleaner-java-2.2/src/main/java/org/htmlcleaner/0000755000175000017500000000000012147655555023305 5ustar stappersstapperslibhtmlcleaner-java-2.2/src/main/java/org/htmlcleaner/PrettyXmlSerializer.java0000600000175000017500000001463712147655455030154 0ustar stappersstappers/* Copyright (c) 2006-2007, Vladimir Nikic All rights reserved. Redistribution and use of this software in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * The name of HtmlCleaner may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You can contact Vladimir Nikic by sending e-mail to nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the subject line. */ package org.htmlcleaner; import java.io.IOException; import java.io.Writer; import java.util.*; /** *

Pretty XML serializer - creates resulting XML with indenting lines.

*/ public class PrettyXmlSerializer extends XmlSerializer { private static final String DEFAULT_INDENTATION_STRING = "\t"; private String indentString = DEFAULT_INDENTATION_STRING; private List indents = new ArrayList(); public PrettyXmlSerializer(CleanerProperties props) { this(props, DEFAULT_INDENTATION_STRING); } public PrettyXmlSerializer(CleanerProperties props, String indentString) { super(props); this.indentString = indentString; } protected void serialize(TagNode tagNode, Writer writer) throws IOException { serializePrettyXml(tagNode, writer, 0); } /** * @param level * @return Appropriate indentation for the specified depth. */ private synchronized String getIndent(int level) { int size = indents.size(); if (size <= level) { String prevIndent = size == 0 ? null : indents.get(size - 1); for (int i = size; i <= level; i++) { String currIndent = prevIndent == null ? "" : prevIndent + indentString; indents.add(currIndent); prevIndent = currIndent; } } return indents.get(level); } private String getIndentedText(String content, int level) { String indent = getIndent(level); StringBuilder result = new StringBuilder( content.length() ); StringTokenizer tokenizer = new StringTokenizer(content, "\n\r"); while (tokenizer.hasMoreTokens()) { String line = tokenizer.nextToken().trim(); if (!"".equals(line)) { result.append(indent).append(line).append("\n"); } } return result.toString(); } private String getSingleLineOfChildren(List children) { StringBuilder result = new StringBuilder(); Iterator childrenIt = children.iterator(); boolean isFirst = true; while (childrenIt.hasNext()) { Object child = childrenIt.next(); if ( !(child instanceof ContentNode) ) { return null; } else { String content = child.toString(); // if first item trims it from left if (isFirst) { content = Utils.ltrim(content); } // if last item trims it from right if (!childrenIt.hasNext()) { content = Utils.rtrim(content); } if ( content.indexOf("\n") >= 0 || content.indexOf("\r") >= 0 ) { return null; 
} result.append(content); } isFirst = false; } return result.toString(); } protected void serializePrettyXml(TagNode tagNode, Writer writer, int level) throws IOException { List tagChildren = tagNode.getChildren(); boolean isHeadlessNode = Utils.isEmptyString(tagNode.getName()); String indent = isHeadlessNode ? "" : getIndent(level); writer.write(indent); serializeOpenTag(tagNode, writer, true); if ( !isMinimizedTagSyntax(tagNode) ) { String singleLine = getSingleLineOfChildren(tagChildren); boolean dontEscape = dontEscape(tagNode); if (singleLine != null) { if ( !dontEscape(tagNode) ) { writer.write( escapeXml(singleLine) ); } else { writer.write( singleLine.replaceAll("]]>", "]]>") ); } } else { if (!isHeadlessNode) { writer.write("\n"); } for (Object child: tagChildren) { if (child instanceof TagNode) { serializePrettyXml( (TagNode)child, writer, isHeadlessNode ? level : level + 1 ); } else if (child instanceof ContentNode) { String content = dontEscape ? child.toString().replaceAll("]]>", "]]>") : escapeXml(child.toString()); writer.write( getIndentedText(content, isHeadlessNode ? level : level + 1) ); } else if (child instanceof CommentNode) { CommentNode commentNode = (CommentNode) child; String content = commentNode.getCommentedContent(); writer.write( getIndentedText(content, isHeadlessNode ? level : level + 1) ); } } } if (singleLine == null) { writer.write(indent); } serializeEndTag(tagNode, writer, true); } } } libhtmlcleaner-java-2.2/src/main/java/org/htmlcleaner/SimpleHtmlSerializer.java0000600000175000017500000000515312147655455030253 0ustar stappersstappers/* Copyright (c) 2006-2007, Vladimir Nikic All rights reserved. Redistribution and use of this software in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * The name of HtmlCleaner may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You can contact Vladimir Nikic by sending e-mail to nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the subject line. */ package org.htmlcleaner; import java.io.*; /** *

Simple HTML serializer - creates resulting HTML without indenting and/or compacting.

*/
public class SimpleHtmlSerializer extends HtmlSerializer {

    /**
     * @param props Cleaner properties handed to the parent serializer
     */
    public SimpleHtmlSerializer(CleanerProperties props) {
        super(props);
    }

    /**
     * Writes the open tag, then every child in document order, then the end tag.
     * Nothing but the open tag is written for nodes using minimized tag syntax.
     * @param tagNode Node to serialize
     * @param writer Destination of the output
     * @throws IOException
     */
    protected void serialize(TagNode tagNode, Writer writer) throws IOException {
        serializeOpenTag(tagNode, writer, false);

        if ( isMinimizedTagSyntax(tagNode) ) {
            return;
        }

        for (Object child: tagNode.getChildren()) {
            if (child instanceof ContentNode) {
                // text content is written raw or escaped, depending on the enclosing tag
                String text = child.toString();
                if ( dontEscape(tagNode) ) {
                    writer.write(text);
                } else {
                    writer.write( escapeText(text) );
                }
                continue;
            }
            if (child instanceof BaseToken) {
                // any other token knows how to serialize itself
                BaseToken token = (BaseToken) child;
                token.serialize(this, writer);
            }
        }

        serializeEndTag(tagNode, writer, false);
    }

}
libhtmlcleaner-java-2.2/src/main/java/org/htmlcleaner/Serializer.java0000600000175000017500000002336412147655455026260 0ustar stappersstappers/* Copyright (c) 2006-2007, Vladimir Nikic All rights reserved. Redistribution and use of this software in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * The name of HtmlCleaner may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You can contact Vladimir Nikic by sending e-mail to nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the subject line. */ package org.htmlcleaner; import java.io.*; import java.util.*; /** *

Basic abstract serializer - contains common logic for descendants (methods writeXXX()).

*/
public abstract class Serializer {

    /**
     * Used to implement serialization with missing envelope - omitting open and close tags, just
     * serialize children. The node is created with an empty name and receives the attributes,
     * children, doctype and namespace declarations of the wrapped node.
     */
    private static class HeadlessTagNode extends TagNode {
        private HeadlessTagNode(TagNode wrappedNode) {
            super("");
            getAttributes().putAll(wrappedNode.getAttributes());
            getChildren().addAll(wrappedNode.getChildren());
            setDocType(wrappedNode.getDocType());
            Map nsDecls = getNamespaceDeclarations();
            if (nsDecls != null) {
                Map wrappedNSDecls = wrappedNode.getNamespaceDeclarations();
                if (wrappedNSDecls != null) {
                    nsDecls.putAll(wrappedNSDecls);
                }
            }
        }
    }

    protected CleanerProperties props;

    protected Serializer(CleanerProperties props) {
        this.props = props;
    }

    /**
     * Writes specified TagNode to the output stream, using specified charset and optionally omits node envelope
     * (skips open and close tags of the node).
     * @param tagNode Node to be written
     * @param out Output stream
     * @param charset Charset of the output
     * @param omitEnvelope Tells whether to skip open and close tag of the node.
     * @throws IOException
     */
    public void writeToStream(TagNode tagNode, OutputStream out, String charset, boolean omitEnvelope) throws IOException {
        write( tagNode, new OutputStreamWriter(out, charset), charset, omitEnvelope );
    }

    /**
     * Writes specified TagNode to the output stream, using specified charset.
     * @param tagNode Node to be written
     * @param out Output stream
     * @param charset Charset of the output
     * @throws IOException
     */
    public void writeToStream(TagNode tagNode, OutputStream out, String charset) throws IOException {
        writeToStream(tagNode, out, charset, false);
    }

    /**
     * Writes specified TagNode to the output stream, using system default charset and optionally omits node envelope
     * (skips open and close tags of the node).
     * @param tagNode Node to be written
     * @param out Output stream
     * @param omitEnvelope Tells whether to skip open and close tag of the node.
     * @throws IOException
     */
    public void writeToStream(TagNode tagNode, OutputStream out, boolean omitEnvelope) throws IOException {
        writeToStream( tagNode, out, HtmlCleaner.DEFAULT_CHARSET, omitEnvelope );
    }

    /**
     * Writes specified TagNode to the output stream, using system default charset.
     * @param tagNode Node to be written
     * @param out Output stream
     * @throws IOException
     */
    public void writeToStream(TagNode tagNode, OutputStream out) throws IOException {
        writeToStream(tagNode, out, false);
    }

    /**
     * Writes specified TagNode to the file, using specified charset and optionally omits node envelope
     * (skips open and close tags of the node).
     * @param tagNode Node to be written
     * @param fileName Output file name
     * @param charset Charset of the output
     * @param omitEnvelope Tells whether to skip open and close tag of the node.
     * @throws IOException
     */
    public void writeToFile(TagNode tagNode, String fileName, String charset, boolean omitEnvelope) throws IOException {
        OutputStream out = new FileOutputStream(fileName);
        try {
            writeToStream(tagNode, out, charset, omitEnvelope);
        } finally {
            // release the file handle even when serialization fails part-way
            // (e.g. unsupported charset); closing an already closed stream is harmless
            out.close();
        }
    }

    /**
     * Writes specified TagNode to the file, using specified charset.
     * @param tagNode Node to be written
     * @param fileName Output file name
     * @param charset Charset of the output
     * @throws IOException
     */
    public void writeToFile(TagNode tagNode, String fileName, String charset) throws IOException {
        writeToFile(tagNode, fileName, charset, false);
    }

    /**
     * Writes specified TagNode to the file, using system default charset and optionally omits node envelope
     * (skips open and close tags of the node).
     * @param tagNode Node to be written
     * @param fileName Output file name
     * @param omitEnvelope Tells whether to skip open and close tag of the node.
     * @throws IOException
     */
    public void writeToFile(TagNode tagNode, String fileName, boolean omitEnvelope) throws IOException {
        writeToFile(tagNode, fileName, HtmlCleaner.DEFAULT_CHARSET, omitEnvelope);
    }

    /**
     * Writes specified TagNode to the file, using system default charset.
     * @param tagNode Node to be written
     * @param fileName Output file name
     * @throws IOException
     */
    public void writeToFile(TagNode tagNode, String fileName) throws IOException {
        writeToFile(tagNode, fileName, false);
    }

    /**
     * @param tagNode Node to serialize to string
     * @param charset Charset of the output - stands in xml declaration part
     * @param omitEnvelope Tells whether to skip open and close tag of the node.
     * @return Output as string
     * @throws IOException
     */
    public String getAsString(TagNode tagNode, String charset, boolean omitEnvelope) throws IOException {
        StringWriter writer = new StringWriter();
        write(tagNode, writer, charset, omitEnvelope);
        return writer.getBuffer().toString();
    }

    /**
     * @param tagNode Node to serialize to string
     * @param charset Charset of the output - stands in xml declaration part
     * @return Output as string
     * @throws IOException
     */
    public String getAsString(TagNode tagNode, String charset) throws IOException {
        return getAsString(tagNode, charset, false);
    }

    /**
     * @param tagNode Node to serialize to string
     * @param omitEnvelope Tells whether to skip open and close tag of the node.
     * @return Output as string
     * @throws IOException
     */
    public String getAsString(TagNode tagNode, boolean omitEnvelope) throws IOException {
        return getAsString(tagNode, HtmlCleaner.DEFAULT_CHARSET, omitEnvelope);
    }

    /**
     * @param tagNode Node to serialize to string
     * @return Output as string
     * @throws IOException
     */
    public String getAsString(TagNode tagNode) throws IOException {
        return getAsString(tagNode, false);
    }

    /**
     * Writes specified node using specified writer.
     * @param tagNode Node to serialize.
     * @param writer Writer instance
     * @param charset Charset of the output
     * @throws IOException
     */
    public void write(TagNode tagNode, Writer writer, String charset) throws IOException {
        write(tagNode, writer, charset, false);
    }

    /**
     * Writes specified node using specified writer. The XML declaration and the doctype are
     * written first unless the cleaner properties tell to omit them. The writer is flushed
     * and closed after the node has been written.
     * @param tagNode Node to serialize.
     * @param writer Writer instance
     * @param charset Charset of the output - named in the XML declaration when not null
     * @param omitEnvelope Tells whether to skip open and close tag of the node.
     * @throws IOException
     */
    public void write(TagNode tagNode, Writer writer, String charset, boolean omitEnvelope) throws IOException {
        if (omitEnvelope) {
            tagNode = new HeadlessTagNode(tagNode);
        }
        writer = new BufferedWriter(writer);
        if ( !props.isOmitXmlDeclaration() ) {
            StringBuilder declaration = new StringBuilder("<?xml version=\"1.0\"");
            if (charset != null) {
                declaration.append(" encoding=\"").append(charset).append("\"");
            }
            declaration.append("?>");
            writer.write(declaration + "\n");
        }

        if ( !props.isOmitDoctypeDeclaration() ) {
            DoctypeToken doctypeToken = tagNode.getDocType();
            if ( doctypeToken != null ) {
                doctypeToken.serialize(this, writer);
            }
        }

        serialize(tagNode, writer);

        writer.flush();
        writer.close();
    }

    /**
     * @param tagNode Node to check
     * @return true if the name of the node is "script" or "style", ignoring case
     */
    protected boolean isScriptOrStyle(TagNode tagNode) {
        String tagName = tagNode.getName();
        return "script".equalsIgnoreCase(tagName) || "style".equalsIgnoreCase(tagName);
    }

    /**
     * Writes the given node to the writer in the format of the concrete serializer.
     * @param tagNode Node to serialize
     * @param writer Destination of the output
     * @throws IOException
     */
    protected abstract void serialize(TagNode tagNode, Writer writer) throws IOException;

}
libhtmlcleaner-java-2.2/src/main/java/org/htmlcleaner/DefaultTagProvider.java0000600000175000017500000006171312147655455027702 0ustar stappersstappers/* Copyright (c) 2006-2007, Vladimir Nikic All rights reserved. Redistribution and use of this software in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * The name of HtmlCleaner may not be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You can contact Vladimir Nikic by sending e-mail to nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the subject line. */ package org.htmlcleaner; import java.util.HashMap; /** * This class is automatically created from ConfigFileTagProvider which reads * default XML configuration file with tag descriptions. * It is used as default tag info provider. * Class is created for performance purposes - parsing XML file requires some * processing time. */ public class DefaultTagProvider extends HashMap implements ITagInfoProvider { // singleton instance, used if no other TagInfoProvider is specified private static DefaultTagProvider _instance; /** * @return Singleton instance of this class. 
*/ public static synchronized DefaultTagProvider getInstance() { if (_instance == null) { _instance = new DefaultTagProvider(); } return _instance; } public DefaultTagProvider() { TagInfo tagInfo; tagInfo = new TagInfo("div", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineCloseBeforeCopyInsideTags("a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font"); tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml"); this.put("div", tagInfo); tagInfo = new TagInfo("span", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); this.put("span", tagInfo); tagInfo = new TagInfo("meta", TagInfo.CONTENT_NONE, TagInfo.HEAD, false, false, false); this.put("meta", tagInfo); tagInfo = new TagInfo("link", TagInfo.CONTENT_NONE, TagInfo.HEAD, false, false, false); this.put("link", tagInfo); tagInfo = new TagInfo("title", TagInfo.CONTENT_TEXT, TagInfo.HEAD, false, true, false); this.put("title", tagInfo); tagInfo = new TagInfo("style", TagInfo.CONTENT_TEXT, TagInfo.HEAD, false, false, false); this.put("style", tagInfo); tagInfo = new TagInfo("bgsound", TagInfo.CONTENT_NONE, TagInfo.HEAD, false, false, false); this.put("bgsound", tagInfo); tagInfo = new TagInfo("h1", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineCloseBeforeCopyInsideTags("a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font"); tagInfo.defineCloseBeforeTags("h1,h2,h3,h4,h5,h6,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml"); this.put("h1", tagInfo); tagInfo = new TagInfo("h2", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineCloseBeforeCopyInsideTags("a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font"); tagInfo.defineCloseBeforeTags("h1,h2,h3,h4,h5,h6,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml"); this.put("h2", tagInfo); tagInfo = new TagInfo("h3", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); 
tagInfo.defineCloseBeforeCopyInsideTags("a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font"); tagInfo.defineCloseBeforeTags("h1,h2,h3,h4,h5,h6,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml"); this.put("h3", tagInfo); tagInfo = new TagInfo("h4", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineCloseBeforeCopyInsideTags("a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font"); tagInfo.defineCloseBeforeTags("h1,h2,h3,h4,h5,h6,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml"); this.put("h4", tagInfo); tagInfo = new TagInfo("h5", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineCloseBeforeCopyInsideTags("a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font"); tagInfo.defineCloseBeforeTags("h1,h2,h3,h4,h5,h6,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml"); this.put("h5", tagInfo); tagInfo = new TagInfo("h6", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineCloseBeforeCopyInsideTags("a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font"); tagInfo.defineCloseBeforeTags("h1,h2,h3,h4,h5,h6,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml"); this.put("h6", tagInfo); tagInfo = new TagInfo("p", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineCloseBeforeCopyInsideTags("a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font"); tagInfo.defineCloseBeforeTags("p,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml"); this.put("p", tagInfo); tagInfo = new TagInfo("strong", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); this.put("strong", tagInfo); tagInfo = new TagInfo("em", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); this.put("em", tagInfo); tagInfo = new TagInfo("abbr", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); this.put("abbr", tagInfo); tagInfo = new TagInfo("acronym", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); 
this.put("acronym", tagInfo); tagInfo = new TagInfo("address", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineCloseBeforeCopyInsideTags("a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font"); tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml"); this.put("address", tagInfo); tagInfo = new TagInfo("bdo", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); this.put("bdo", tagInfo); tagInfo = new TagInfo("blockquote", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineCloseBeforeCopyInsideTags("a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font"); tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml"); this.put("blockquote", tagInfo); tagInfo = new TagInfo("cite", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); this.put("cite", tagInfo); tagInfo = new TagInfo("q", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); this.put("q", tagInfo); tagInfo = new TagInfo("code", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); this.put("code", tagInfo); tagInfo = new TagInfo("ins", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); this.put("ins", tagInfo); tagInfo = new TagInfo("del", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); this.put("del", tagInfo); tagInfo = new TagInfo("dfn", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); this.put("dfn", tagInfo); tagInfo = new TagInfo("kbd", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); this.put("kbd", tagInfo); tagInfo = new TagInfo("pre", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineCloseBeforeCopyInsideTags("a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font"); tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml"); this.put("pre", tagInfo); tagInfo = new TagInfo("samp", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); this.put("samp", 
tagInfo); tagInfo = new TagInfo("listing", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineCloseBeforeCopyInsideTags("a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font"); tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml"); this.put("listing", tagInfo); tagInfo = new TagInfo("var", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); this.put("var", tagInfo); tagInfo = new TagInfo("br", TagInfo.CONTENT_NONE, TagInfo.BODY, false, false, false); this.put("br", tagInfo); tagInfo = new TagInfo("wbr", TagInfo.CONTENT_NONE, TagInfo.BODY, false, false, false); this.put("wbr", tagInfo); tagInfo = new TagInfo("nobr", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineCloseBeforeTags("nobr"); this.put("nobr", tagInfo); tagInfo = new TagInfo("xmp", TagInfo.CONTENT_TEXT, TagInfo.BODY, false, false, false); this.put("xmp", tagInfo); tagInfo = new TagInfo("a", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineCloseBeforeTags("a"); this.put("a", tagInfo); tagInfo = new TagInfo("base", TagInfo.CONTENT_NONE, TagInfo.HEAD, false, false, false); this.put("base", tagInfo); tagInfo = new TagInfo("img", TagInfo.CONTENT_NONE, TagInfo.BODY, false, false, false); this.put("img", tagInfo); tagInfo = new TagInfo("area", TagInfo.CONTENT_NONE, TagInfo.BODY, false, false, false); tagInfo.defineFatalTags("map"); tagInfo.defineCloseBeforeTags("area"); this.put("area", tagInfo); tagInfo = new TagInfo("map", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineCloseBeforeTags("map"); this.put("map", tagInfo); tagInfo = new TagInfo("object", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); this.put("object", tagInfo); tagInfo = new TagInfo("param", TagInfo.CONTENT_NONE, TagInfo.BODY, false, false, false); tagInfo.defineCloseBeforeCopyInsideTags("a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font"); 
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml"); this.put("param", tagInfo); tagInfo = new TagInfo("applet", TagInfo.CONTENT_ALL, TagInfo.BODY, true, false, false); this.put("applet", tagInfo); tagInfo = new TagInfo("xml", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); this.put("xml", tagInfo); tagInfo = new TagInfo("ul", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineCloseBeforeCopyInsideTags("a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font"); tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml"); this.put("ul", tagInfo); tagInfo = new TagInfo("ol", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineCloseBeforeCopyInsideTags("a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font"); tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml"); this.put("ol", tagInfo); tagInfo = new TagInfo("li", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineCloseBeforeCopyInsideTags("a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font"); tagInfo.defineCloseBeforeTags("li,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml"); this.put("li", tagInfo); tagInfo = new TagInfo("dl", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineCloseBeforeCopyInsideTags("a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font"); tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml"); this.put("dl", tagInfo); tagInfo = new TagInfo("dt", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineCloseBeforeTags("dt,dd"); this.put("dt", tagInfo); tagInfo = new TagInfo("dd", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineCloseBeforeTags("dt,dd"); this.put("dd", tagInfo); tagInfo = new TagInfo("menu", TagInfo.CONTENT_ALL, TagInfo.BODY, true, false, false); 
tagInfo.defineCloseBeforeCopyInsideTags("a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font"); tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml"); this.put("menu", tagInfo); tagInfo = new TagInfo("dir", TagInfo.CONTENT_ALL, TagInfo.BODY, true, false, false); tagInfo.defineCloseBeforeCopyInsideTags("a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font"); tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml"); this.put("dir", tagInfo); tagInfo = new TagInfo("table", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineAllowedChildrenTags("tr,tbody,thead,tfoot,colgroup,col,form,caption,tr"); tagInfo.defineCloseBeforeCopyInsideTags("a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font"); tagInfo.defineCloseBeforeTags("tr,thead,tbody,tfoot,caption,colgroup,table,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param"); this.put("table", tagInfo); tagInfo = new TagInfo("tr", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineFatalTags("table"); tagInfo.defineRequiredEnclosingTags("tbody"); tagInfo.defineAllowedChildrenTags("td,th"); tagInfo.defineHigherLevelTags("thead,tfoot"); tagInfo.defineCloseBeforeTags("tr,td,th,caption,colgroup"); this.put("tr", tagInfo); tagInfo = new TagInfo("td", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineFatalTags("table"); tagInfo.defineRequiredEnclosingTags("tr"); tagInfo.defineCloseBeforeTags("td,th,caption,colgroup"); this.put("td", tagInfo); tagInfo = new TagInfo("th", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineFatalTags("table"); tagInfo.defineRequiredEnclosingTags("tr"); tagInfo.defineCloseBeforeTags("td,th,caption,colgroup"); this.put("th", tagInfo); tagInfo = new TagInfo("tbody", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineFatalTags("table"); tagInfo.defineAllowedChildrenTags("tr,form"); 
tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup"); this.put("tbody", tagInfo); tagInfo = new TagInfo("thead", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineFatalTags("table"); tagInfo.defineAllowedChildrenTags("tr,form"); tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup"); this.put("thead", tagInfo); tagInfo = new TagInfo("tfoot", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineFatalTags("table"); tagInfo.defineAllowedChildrenTags("tr,form"); tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup"); this.put("tfoot", tagInfo); tagInfo = new TagInfo("col", TagInfo.CONTENT_NONE, TagInfo.BODY, false, false, false); tagInfo.defineFatalTags("table"); this.put("col", tagInfo); tagInfo = new TagInfo("colgroup", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineFatalTags("table"); tagInfo.defineAllowedChildrenTags("col"); tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup"); this.put("colgroup", tagInfo); tagInfo = new TagInfo("caption", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineFatalTags("table"); tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup"); this.put("caption", tagInfo); tagInfo = new TagInfo("form", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, true); tagInfo.defineForbiddenTags("form"); tagInfo.defineCloseBeforeCopyInsideTags("a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font"); tagInfo.defineCloseBeforeTags("option,optgroup,textarea,select,fieldset,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml"); this.put("form", tagInfo); tagInfo = new TagInfo("input", TagInfo.CONTENT_NONE, TagInfo.BODY, false, false, false); tagInfo.defineCloseBeforeTags("select,optgroup,option"); this.put("input", tagInfo); tagInfo = new TagInfo("textarea", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); 
tagInfo.defineCloseBeforeTags("select,optgroup,option"); this.put("textarea", tagInfo); tagInfo = new TagInfo("select", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, true); tagInfo.defineAllowedChildrenTags("option,optgroup"); tagInfo.defineCloseBeforeTags("option,optgroup,select"); this.put("select", tagInfo); tagInfo = new TagInfo("option", TagInfo.CONTENT_TEXT, TagInfo.BODY, false, false, true); tagInfo.defineFatalTags("select"); tagInfo.defineCloseBeforeTags("option"); this.put("option", tagInfo); tagInfo = new TagInfo("optgroup", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, true); tagInfo.defineFatalTags("select"); tagInfo.defineAllowedChildrenTags("option"); tagInfo.defineCloseBeforeTags("optgroup"); this.put("optgroup", tagInfo); tagInfo = new TagInfo("button", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineCloseBeforeTags("select,optgroup,option"); this.put("button", tagInfo); tagInfo = new TagInfo("label", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); this.put("label", tagInfo); tagInfo = new TagInfo("fieldset", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineCloseBeforeCopyInsideTags("a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font"); tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml"); this.put("fieldset", tagInfo); tagInfo = new TagInfo("legend", TagInfo.CONTENT_TEXT, TagInfo.BODY, false, false, false); tagInfo.defineRequiredEnclosingTags("fieldset"); tagInfo.defineCloseBeforeTags("legend"); this.put("legend", tagInfo); tagInfo = new TagInfo("isindex", TagInfo.CONTENT_NONE, TagInfo.BODY, true, false, false); tagInfo.defineCloseBeforeCopyInsideTags("a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font"); tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml"); this.put("isindex", tagInfo); tagInfo = new TagInfo("script", TagInfo.CONTENT_ALL, TagInfo.HEAD_AND_BODY, false, false, false); 
this.put("script", tagInfo); tagInfo = new TagInfo("noscript", TagInfo.CONTENT_ALL, TagInfo.HEAD_AND_BODY, false, false, false); this.put("noscript", tagInfo); tagInfo = new TagInfo("b", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineCloseInsideCopyAfterTags("u,i,tt,sub,sup,big,small,strike,blink,s"); this.put("b", tagInfo); tagInfo = new TagInfo("i", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineCloseInsideCopyAfterTags("b,u,tt,sub,sup,big,small,strike,blink,s"); this.put("i", tagInfo); tagInfo = new TagInfo("u", TagInfo.CONTENT_ALL, TagInfo.BODY, true, false, false); tagInfo.defineCloseInsideCopyAfterTags("b,i,tt,sub,sup,big,small,strike,blink,s"); this.put("u", tagInfo); tagInfo = new TagInfo("tt", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineCloseInsideCopyAfterTags("b,u,i,sub,sup,big,small,strike,blink,s"); this.put("tt", tagInfo); tagInfo = new TagInfo("sub", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sup,big,small,strike,blink,s"); this.put("sub", tagInfo); tagInfo = new TagInfo("sup", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,big,small,strike,blink,s"); this.put("sup", tagInfo); tagInfo = new TagInfo("big", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,sup,small,strike,blink,s"); this.put("big", tagInfo); tagInfo = new TagInfo("small", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,sup,big,strike,blink,s"); this.put("small", tagInfo); tagInfo = new TagInfo("strike", TagInfo.CONTENT_ALL, TagInfo.BODY, true, false, false); tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,sup,big,small,blink,s"); this.put("strike", tagInfo); tagInfo = new TagInfo("blink", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); 
tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,sup,big,small,strike,s"); this.put("blink", tagInfo); tagInfo = new TagInfo("marquee", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); tagInfo.defineCloseBeforeCopyInsideTags("a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font"); tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml"); this.put("marquee", tagInfo); tagInfo = new TagInfo("s", TagInfo.CONTENT_ALL, TagInfo.BODY, true, false, false); tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,sup,big,small,strike,blink"); this.put("s", tagInfo); tagInfo = new TagInfo("hr", TagInfo.CONTENT_NONE, TagInfo.BODY, false, false, false); tagInfo.defineCloseBeforeCopyInsideTags("a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font"); tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml"); this.put("hr", tagInfo); tagInfo = new TagInfo("font", TagInfo.CONTENT_ALL, TagInfo.BODY, true, false, false); this.put("font", tagInfo); tagInfo = new TagInfo("basefont", TagInfo.CONTENT_NONE, TagInfo.BODY, true, false, false); this.put("basefont", tagInfo); tagInfo = new TagInfo("center", TagInfo.CONTENT_ALL, TagInfo.BODY, true, false, false); tagInfo.defineCloseBeforeCopyInsideTags("a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font"); tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml"); this.put("center", tagInfo); tagInfo = new TagInfo("comment", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); this.put("comment", tagInfo); tagInfo = new TagInfo("server", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); this.put("server", tagInfo); tagInfo = new TagInfo("iframe", TagInfo.CONTENT_ALL, TagInfo.BODY, false, false, false); this.put("iframe", tagInfo); tagInfo = new TagInfo("embed", TagInfo.CONTENT_NONE, TagInfo.BODY, false, false, false); 
tagInfo.defineCloseBeforeCopyInsideTags("a,bdo,strong,em,q,b,i,u,tt,sub,sup,big,small,strike,s,font"); tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml"); this.put("embed", tagInfo); } /** * Looks up tag info by delegating to get(tagName). * The name is passed through unchanged: unlike removeTagInfo() and addTagInfo(), no lower-casing is applied here, so callers presumably pass an already lower-cased name - TODO confirm against callers. * @param tagName Name of the tag to look up. * @return whatever get(tagName) returns for that name. */ public TagInfo getTagInfo(String tagName) { return get(tagName); } /** * Removes tag info with specified name. The name is lower-cased before removal; a null name is silently ignored. * @param tagName Name of the tag to be removed from the tag provider. */ public void removeTagInfo(String tagName) { if (tagName != null) { remove(tagName.toLowerCase()); } } /** * Sets new tag info. The info is stored under its lower-cased name via put(); a null argument is silently ignored. * @param tagInfo tag info to be added to the provider. */ public void addTagInfo(TagInfo tagInfo) { if (tagInfo != null) { put(tagInfo.getName().toLowerCase(), tagInfo); } } } libhtmlcleaner-java-2.2/src/main/java/org/htmlcleaner/HtmlSerializer.java0000600000175000017500000002070012147655455027074 0ustar stappersstappers/* Copyright (c) 2006-2007, Vladimir Nikic All rights reserved. Redistribution and use of this software in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * The name of HtmlCleaner may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You can contact Vladimir Nikic by sending e-mail to nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the subject line. */ package org.htmlcleaner; import java.io.*; import java.util.*; /** *

Abstract HTML serializer - contains common logic for descendants.

*/ public abstract class HtmlSerializer extends Serializer { protected HtmlSerializer(CleanerProperties props) { super(props); } protected boolean isMinimizedTagSyntax(TagNode tagNode) { final TagInfo tagInfo = props.getTagInfoProvider().getTagInfo(tagNode.getName()); return tagInfo != null && !tagNode.hasChildren() && tagInfo.isEmptyTag(); } protected boolean dontEscape(TagNode tagNode) { return isScriptOrStyle(tagNode); } protected String escapeText(String s) { boolean recognizeUnicodeChars = props.isRecognizeUnicodeChars(); boolean translateSpecialEntities = props.isTranslateSpecialEntities(); if (s != null) { int len = s.length(); StringBuilder result = new StringBuilder(len); for (int i = 0; i < len; i++) { char ch = s.charAt(i); if (ch == '&') { if (i < len-2 && s.charAt(i+1) == '#') { boolean isHex = Character.toLowerCase(s.charAt(i+2)) == 'x'; int charIndex = i + (isHex ? 3 : 2); int radix = isHex ? 16 : 10; String unicode = ""; while (charIndex < len) { char currCh = s.charAt(charIndex); if (currCh == ';') { break; } else if (Utils.isValidInt(unicode + currCh, radix)) { unicode += currCh; charIndex++; } else { charIndex--; break; } } if (Utils.isValidInt(unicode, radix)) { char unicodeChar = (char)Integer.parseInt(unicode, radix); if ( !Utils.isValidXmlChar(unicodeChar) ) { i = charIndex; } else if ( !Utils.isReservedXmlChar(unicodeChar) ) { result.append( recognizeUnicodeChars ? String.valueOf(unicodeChar) : "&#" + unicode + ";" ); i = charIndex; } else { i = charIndex; result.append("&#" + unicode + ";"); } } else { result.append(props.transResCharsToNCR ? 
"&#" + (int)'&' + ";" : "&"); } } else { // get minimal following sequence required to recognize some special entitiy String seq = s.substring(i, i + Math.min(SpecialEntity.getMaxEntityLength() + 2, len - i)); int semiIndex = seq.indexOf(';'); if (semiIndex > 0) { String entityKey = seq.substring(1, semiIndex); SpecialEntity entity = SpecialEntity.getEntity(entityKey); if (entity != null) { if (translateSpecialEntities) { result.append(props.isTransSpecialEntitiesToNCR() ? entity.getDecimalNCR() : entity.getCharacter()); } else { result.append(entity.getEscapedValue()); } i += entityKey.length() + 1; continue; } } String sub = s.substring(i); boolean isReservedSeq = false; for (Map.Entry entry: Utils.RESERVED_XML_CHARS.entrySet()) { seq = entry.getValue(); if ( sub.startsWith(seq) ) { result.append( props.transResCharsToNCR ? "&#" + (int)entry.getKey() + ";" : seq ); i += seq.length() - 1; isReservedSeq = true; break; } } if (!isReservedSeq) { result.append( props.transResCharsToNCR ? "&#" + (int)'&' + ";" : "&" ); } } } else if (Utils.isReservedXmlChar(ch)) { result.append( props.transResCharsToNCR ? 
"&#" + (int)ch + ";" : ch ); } else { result.append(ch); } } return result.toString(); } return null; } protected void serializeOpenTag(TagNode tagNode, Writer writer, boolean newLine) throws IOException { String tagName = tagNode.getName(); if (Utils.isEmptyString(tagName)) { return; } boolean nsAware = props.isNamespacesAware(); if (!nsAware && Utils.getXmlNSPrefix(tagName) != null ) { tagName = Utils.getXmlName(tagName); } writer.write("<" + tagName); for (Map.Entry entry: tagNode.getAttributes().entrySet()) { String attName = entry.getKey(); if (!nsAware && Utils.getXmlNSPrefix(attName) != null ) { attName = Utils.getXmlName(attName); } writer.write(" " + attName + "=\"" + escapeText(entry.getValue()) + "\""); } if (nsAware) { Map nsDeclarations = tagNode.getNamespaceDeclarations(); if (nsDeclarations != null) { for (Map.Entry entry: nsDeclarations.entrySet()) { String prefix = entry.getKey(); String att = "xmlns"; if (prefix.length() > 0) { att += ":" + prefix; } writer.write(" " + att + "=\"" + escapeText(entry.getValue()) + "\""); } } } if ( isMinimizedTagSyntax(tagNode) ) { writer.write(" />"); if (newLine) { writer.write("\n"); } } else { writer.write(">"); } } protected void serializeEndTag(TagNode tagNode, Writer writer, boolean newLine) throws IOException { String tagName = tagNode.getName(); if (Utils.isEmptyString(tagName)) { return; } if (Utils.getXmlNSPrefix(tagName) != null && !props.isNamespacesAware()) { tagName = Utils.getXmlName(tagName); } writer.write( "" ); if (newLine) { writer.write("\n"); } } } libhtmlcleaner-java-2.2/src/main/java/org/htmlcleaner/XmlSerializer.java0000600000175000017500000001770212147655455026740 0ustar stappersstappers/* Copyright (c) 2006-2007, Vladimir Nikic All rights reserved. 
Redistribution and use of this software in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * The name of HtmlCleaner may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You can contact Vladimir Nikic by sending e-mail to nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the subject line. */ package org.htmlcleaner; import java.io.*; import java.util.*; /** *

Abstract XML serializer - contains common logic for descendants.

*/ public abstract class XmlSerializer extends Serializer { protected XmlSerializer(CleanerProperties props) { super(props); } /** * @deprecated Use writeToStream() instead. */ @Deprecated public void writeXmlToStream(TagNode tagNode, OutputStream out, String charset) throws IOException { super.writeToStream(tagNode, out, charset); } /** * @deprecated Use writeToStream() instead. */ @Deprecated public void writeXmlToStream(TagNode tagNode, OutputStream out) throws IOException { super.writeToStream(tagNode, out); } /** * @deprecated Use writeToFile() instead. */ @Deprecated public void writeXmlToFile(TagNode tagNode, String fileName, String charset) throws IOException { super.writeToFile(tagNode, fileName, charset); } /** * @deprecated Use writeToFile() instead. */ @Deprecated public void writeXmlToFile(TagNode tagNode, String fileName) throws IOException { super.writeToFile(tagNode, fileName); } /** * @deprecated Use getAsString() instead. */ @Deprecated public String getXmlAsString(TagNode tagNode, String charset) throws IOException { return super.getAsString(tagNode, charset); } /** * @deprecated Use getAsString() instead. */ @Deprecated public String getXmlAsString(TagNode tagNode) throws IOException { return super.getAsString(tagNode); } /** * @deprecated Use write() instead. 
*/ @Deprecated public void writeXml(TagNode tagNode, Writer writer, String charset) throws IOException { super.write(tagNode, writer, charset); } protected String escapeXml(String xmlContent) { return Utils.escapeXml(xmlContent, props, false); } protected boolean dontEscape(TagNode tagNode) { return props.isUseCdataForScriptAndStyle() && isScriptOrStyle(tagNode); } protected boolean isMinimizedTagSyntax(TagNode tagNode) { final TagInfo tagInfo = props.getTagInfoProvider().getTagInfo(tagNode.getName()); return tagNode.getChildren().size() == 0 && ( props.isUseEmptyElementTags() || (tagInfo != null && tagInfo.isEmptyTag()) ); } protected void serializeOpenTag(TagNode tagNode, Writer writer, boolean newLine) throws IOException { String tagName = tagNode.getName(); if (Utils.isEmptyString(tagName)) { return; } boolean nsAware = props.isNamespacesAware(); Set definedNSPrefixes = null; Set additionalNSDeclNeeded = null; String tagPrefix = Utils.getXmlNSPrefix(tagName); if (tagPrefix != null) { if (nsAware) { definedNSPrefixes = new HashSet(); tagNode.collectNamespacePrefixesOnPath(definedNSPrefixes); if ( !definedNSPrefixes.contains(tagPrefix) ) { additionalNSDeclNeeded = new TreeSet(); additionalNSDeclNeeded.add(tagPrefix); } } else { tagName = Utils.getXmlName(tagName); } } writer.write("<" + tagName); // write attributes for (Map.Entry entry: tagNode.getAttributes().entrySet()) { String attName = entry.getKey(); String attPrefix = Utils.getXmlNSPrefix(attName); if (attPrefix != null) { if (nsAware) { // collect used namespace prefixes in attributes in order to explicitly define // ns declaration if needed; otherwise it would be ill-formed xml if (definedNSPrefixes == null) { definedNSPrefixes = new HashSet(); tagNode.collectNamespacePrefixesOnPath(definedNSPrefixes); } if ( !definedNSPrefixes.contains(attPrefix) ) { if (additionalNSDeclNeeded == null) { additionalNSDeclNeeded = new TreeSet(); } additionalNSDeclNeeded.add(attPrefix); } } else { attName = 
Utils.getXmlName(attName); } } writer.write(" " + attName + "=\"" + escapeXml(entry.getValue()) + "\""); } // write namespace declarations if (nsAware) { Map nsDeclarations = tagNode.getNamespaceDeclarations(); if (nsDeclarations != null) { for (Map.Entry entry: nsDeclarations.entrySet()) { String prefix = entry.getKey(); String att = "xmlns"; if (prefix.length() > 0) { att += ":" + prefix; } writer.write(" " + att + "=\"" + escapeXml(entry.getValue()) + "\""); } } } // write additional namespace declarations needed for this tag in order xml to be well-formed if (additionalNSDeclNeeded != null) { for (String prefix: additionalNSDeclNeeded) { writer.write(" xmlns:" + prefix + "=\"" + prefix + "\""); } } if ( isMinimizedTagSyntax(tagNode) ) { writer.write(" />"); if (newLine) { writer.write("\n"); } } else if (dontEscape(tagNode)) { writer.write(">"); } } protected void serializeEndTag(TagNode tagNode, Writer writer, boolean newLine) throws IOException { String tagName = tagNode.getName(); if (Utils.isEmptyString(tagName)) { return; } if (dontEscape(tagNode)) { writer.write("]]>"); } if (Utils.getXmlNSPrefix(tagName) != null && !props.isNamespacesAware()) { tagName = Utils.getXmlName(tagName); } writer.write( "" ); if (newLine) { writer.write("\n"); } } } libhtmlcleaner-java-2.2/src/main/java/org/htmlcleaner/DoctypeToken.java0000600000175000017500000000736412147655455026561 0ustar stappersstappers/* Copyright (c) 2006-2007, Vladimir Nikic All rights reserved. Redistribution and use of this software in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
* The name of HtmlCleaner may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You can contact Vladimir Nikic by sending e-mail to nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the subject line. */ package org.htmlcleaner; import java.io.IOException; import java.io.Writer; import org.htmlcleaner.BaseToken; /** *

HTML doctype token.

*/ public class DoctypeToken implements BaseToken {

    // The four whitespace-separated parts of a doctype declaration, in order.
    private String part1;
    private String part2;
    private String part3;
    private String part4;

    /**
     * Creates a doctype token from its four parts.
     * The first two parts are upper-cased; the last two have markup-significant
     * characters replaced by spaces (see clean()). Null parts are kept as null.
     * @param part1 first part of the declaration.
     * @param part2 second part; isValid() accepts only "public" or "system" here.
     * @param part3 third part, cleaned.
     * @param part4 fourth part, cleaned.
     */
    public DoctypeToken(String part1, String part2, String part3, String part4) {
        // explicit locale: the default-locale variant turns 'i' into a dotted capital I
        // under e.g. a Turkish locale, which would corrupt keywords such as PUBLIC
        this.part1 = part1 != null ? part1.toUpperCase(java.util.Locale.ENGLISH) : part1;
        this.part2 = part2 != null ? part2.toUpperCase(java.util.Locale.ENGLISH) : part2;
        this.part3 = clean(part3);
        this.part4 = clean(part4);
    }

    /**
     * Replaces the characters &gt;, &lt;, &amp;, ' and " with spaces so that the
     * value can be placed inside a quoted doctype part.
     * @param s value to clean, may be null.
     * @return cleaned value, or null when s is null.
     */
    private String clean(String s) {
        if (s != null) {
            s = s.replace('>', ' ');
            s = s.replace('<', ' ');
            s = s.replace('&', ' ');
            s = s.replace('\'', ' ');
            s = s.replace('\"', ' ');
        }

        return s;
    }

    /**
     * Checks the combination of parts.
     * @return false when part1 is empty, when part2 is neither "public" nor
     *         "system" (case-insensitive), when a "system" doctype carries a
     *         non-empty part4, or when a "public" doctype lacks part4;
     *         true otherwise.
     */
    public boolean isValid() {
        if ( part1 == null || "".equals(part1) ) {
            return false;
        }

        if ( !"public".equalsIgnoreCase(part2) && !"system".equalsIgnoreCase(part2) ) {
            return false;
        }

        if ( "system".equalsIgnoreCase(part2) && part4 != null && !"".equals(part4) ) {
            return false;
        }

        if ( "public".equalsIgnoreCase(part2) && (part4 == null || "".equals(part4)) ) {
            return false;
        }

        return true;
    }

    /**
     * Builds the textual doctype declaration from the four parts.
     * part3 is always written in double quotes; part4 is appended in double
     * quotes only when it is non-empty.
     * @return the declaration text.
     */
    public String getContent() {
        String result = "<!DOCTYPE " + part1 + " ";
        result += part2 + " \"" + part3 + "\"";
        if ( part4 != null && !"".equals(part4) ) {
            result += " \"" + part4 + "\"";
        }

        result += ">";

        return result;
    }

    /**
     * @return same text as getContent().
     */
    public String toString() {
        return getContent();
    }

    /**
     * @return always an empty string.
     */
    public String getName() {
        return "";
    }

    /**
     * Writes the declaration followed by a line break.
     * @param serializer not used by this token.
     * @param writer destination of the output.
     * @throws IOException when the writer fails.
     */
    public void serialize(Serializer serializer, Writer writer) throws IOException {
        writer.write(getContent() + "\n");
    }

    public String getPart1() {
        return part1;
    }

    public String getPart2() {
        return part2;
    }

    public String getPart3() {
        return part3;
    }

    public String getPart4() {
        return part4;
    }

}
libhtmlcleaner-java-2.2/src/main/java/org/htmlcleaner/CompactHtmlSerializer.java0000600000175000017500000001041012147655455030400 0ustar stappersstappers/* Copyright (c) 2006-2007, Vladimir Nikic All rights reserved. Redistribution and use of this software in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * The name of HtmlCleaner may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You can contact Vladimir Nikic by sending e-mail to nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the subject line. */ package org.htmlcleaner; import java.io.*; import java.util.*; /** *

Compact HTML serializer - creates resulting HTML by stripping whitespaces wherever possible.

*/ public class CompactHtmlSerializer extends HtmlSerializer {

    // Number of "pre" elements currently being serialized (nesting depth);
    // while it is positive, text content is written verbatim.
    private int openPreTags = 0;

    /**
     * Creates a compact serializer bound to the given cleaner properties.
     * @param props properties passed on to the superclass constructor.
     */
    public CompactHtmlSerializer(CleanerProperties props) {
        super(props);
    }

    /**
     * Serializes the node and its children, collapsing leading and trailing
     * whitespace of text content to a single space except inside "pre" elements.
     * @param tagNode node to serialize.
     * @param writer destination of the output.
     * @throws IOException when the writer fails.
     */
    protected void serialize(TagNode tagNode, Writer writer) throws IOException {
        boolean isPreTag = "pre".equalsIgnoreCase(tagNode.getName());
        if (isPreTag) {
            openPreTags++;
        }

        try {
            serializeOpenTag(tagNode, writer, false);

            if ( !isMinimizedTagSyntax(tagNode) ) {
                ListIterator<?> childrenIt = tagNode.getChildren().listIterator();
                while ( childrenIt.hasNext() ) {
                    Object item = childrenIt.next();
                    if (item instanceof ContentNode) {
                        String content = item.toString();
                        if (openPreTags > 0) {
                            writer.write(content);
                        } else {
                            boolean startsWithSpace = content.length() > 0 && Character.isWhitespace( content.charAt(0) );
                            boolean endsWithSpace = content.length() > 1 && Character.isWhitespace( content.charAt(content.length() - 1) );
                            content = dontEscape(tagNode) ? content.trim() : escapeText(content.trim());

                            if (startsWithSpace) {
                                writer.write(' ');
                            }

                            if (content.length() != 0) {
                                writer.write(content);
                                if (endsWithSpace) {
                                    writer.write(' ');
                                }
                            }

                            if (childrenIt.hasNext()) {
                                // peek at the following sibling, then step back so the
                                // main loop still processes it
                                if ( !Utils.isWhitespaceString(childrenIt.next()) ) {
                                    writer.write("\n");
                                }
                                childrenIt.previous();
                            }
                        }
                    } else if (item instanceof CommentNode) {
                        String content = ((CommentNode) item).getCommentedContent().trim();
                        writer.write(content);
                    } else if (item instanceof BaseToken) {
                        ((BaseToken)item).serialize(this, writer);
                    }
                }

                serializeEndTag(tagNode, writer, false);
            }
        } finally {
            // always undo the increment above, also for a minimized tag or a failed write;
            // otherwise the counter stays positive and disables compaction for later nodes
            if (isPreTag) {
                openPreTags--;
            }
        }
    }

}
libhtmlcleaner-java-2.2/src/main/java/org/htmlcleaner/HtmlCleanerForAnt.java0000600000175000017500000003102512147655455027450 0ustar stappersstappers/* Copyright (c) 2006-2007, Vladimir Nikic All rights reserved. 
Redistribution and use of this software in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * The name of HtmlCleaner may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You can contact Vladimir Nikic by sending e-mail to nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the subject line. */ package org.htmlcleaner; import org.apache.tools.ant.BuildException; import java.net.URL; import java.io.File; import java.io.IOException; import java.io.OutputStream; import java.io.FileOutputStream; import java.util.*; /** *

Support for ANT.

*/ public class HtmlCleanerForAnt extends org.apache.tools.ant.Task { private String text; private String src; private String dest; private String incharset = HtmlCleaner.DEFAULT_CHARSET; private String outcharset = HtmlCleaner.DEFAULT_CHARSET; private String taginfofile = null; private String outputtype = "simple"; private boolean advancedxmlescape = true; private boolean transrescharstoncr = false; private boolean usecdata = true; private boolean specialentities = true; private boolean transspecialentitiestoncr = false; private boolean unicodechars = true; private boolean omitunknowntags = false; private boolean treatunknowntagsascontent = false; private boolean omitdeprtags = false; private boolean treatdeprtagsascontent = false; private boolean omitcomments = false; private boolean omitxmldecl = false; private boolean omitdoctypedecl = true; private boolean omithtmlenvelope = false; private boolean useemptyelementtags = true; private boolean allowmultiwordattributes = true; private boolean allowhtmlinsideattributes = false; private boolean ignoreqe = true; private boolean namespacesaware = true; private String hyphenreplacement = "="; private String prunetags = ""; private String booleanatts = CleanerProperties.BOOL_ATT_SELF; private String nodebyxpath = null; private boolean omitenvelope = false; private String transform = null; public void setText(String text) { this.text = text; } public void setSrc(String src) { this.src = src; } public void setDest(String dest) { this.dest = dest; } public void setIncharset(String incharset) { this.incharset = incharset; } public void setOutcharset(String outcharset) { this.outcharset = outcharset; } public void setTaginfofile(String taginfofile) { this.taginfofile = taginfofile; } public void setOutputtype(String outputtype) { this.outputtype = outputtype; } public void setAdvancedxmlescape(boolean advancedxmlescape) { this.advancedxmlescape = advancedxmlescape; } public void setTransrescharstoncr(boolean 
transrescharstoncr) { this.transrescharstoncr = transrescharstoncr; } public void setUsecdata(boolean usecdata) { this.usecdata = usecdata; } public void setSpecialentities(boolean specialentities) { this.specialentities = specialentities; } public void setTransspecialentitiestoncr(boolean transspecialentitiestoncr) { this.transspecialentitiestoncr = transspecialentitiestoncr; } public void setUnicodechars(boolean unicodechars) { this.unicodechars = unicodechars; } public void setOmitunknowntags(boolean omitunknowntags) { this.omitunknowntags = omitunknowntags; } public void setTreatunknowntagsascontent(boolean treatunknowntagsascontent) { this.treatunknowntagsascontent = treatunknowntagsascontent; } public void setOmitdeprtags(boolean omitdeprtags) { this.omitdeprtags = omitdeprtags; } public void setTreatdeprtagsascontent(boolean treatdeprtagsascontent) { this.treatdeprtagsascontent = treatdeprtagsascontent; } public void setOmitcomments(boolean omitcomments) { this.omitcomments = omitcomments; } public void setOmitxmldecl(boolean omitxmldecl) { this.omitxmldecl = omitxmldecl; } public void setOmitdoctypedecl(boolean omitdoctypedecl) { this.omitdoctypedecl = omitdoctypedecl; } public void setOmithtmlenvelope(boolean omithtmlenvelope) { this.omithtmlenvelope = omithtmlenvelope; } public void setUseemptyelementtags(boolean useemptyelementtags) { this.useemptyelementtags = useemptyelementtags; } public void setAllowmultiwordattributes(boolean allowmultiwordattributes) { this.allowmultiwordattributes = allowmultiwordattributes; } public void setAllowhtmlinsideattributes(boolean allowhtmlinsideattributes) { this.allowhtmlinsideattributes = allowhtmlinsideattributes; } public void setIgnoreqe(boolean ignoreqe) { this.ignoreqe = ignoreqe; } public void setNamespacesaware(boolean namespacesaware) { this.namespacesaware = namespacesaware; } public void setHyphenreplacement(String hyphenreplacement) { this.hyphenreplacement = hyphenreplacement; } public void 
setPrunetags(String prunetags) { this.prunetags = prunetags; } public void setBooleanatts(String booleanatts) { this.booleanatts = booleanatts; } public void setNodebyxpath(String nodebyxpath) { this.nodebyxpath = nodebyxpath; } public void setOmitenvelope(boolean omitenvelope) { this.omitenvelope = omitenvelope; } public void setTransform(String transform) { this.transform = transform; } public void addText(String text) { this.text = text; } /** * Implementation of Ant task execution. * @throws BuildException */ public void execute() throws BuildException { HtmlCleaner cleaner; if ( this.taginfofile != null ) { cleaner = new HtmlCleaner(new ConfigFileTagProvider(new File(this.taginfofile))); } else { cleaner = new HtmlCleaner(); } if (text == null && src == null) { throw new BuildException("Eather attribute 'src' or text body containing HTML must be specified!"); } CleanerProperties props = cleaner.getProperties(); props.setAdvancedXmlEscape(this.advancedxmlescape); props.setTransResCharsToNCR(this.transrescharstoncr); props.setUseCdataForScriptAndStyle(this.usecdata); props.setTranslateSpecialEntities(this.specialentities); props.setTransSpecialEntitiesToNCR(this.transspecialentitiestoncr); props.setRecognizeUnicodeChars(this.unicodechars); props.setOmitUnknownTags(this.omitunknowntags); props.setTreatUnknownTagsAsContent(this.treatunknowntagsascontent); props.setOmitDeprecatedTags(this.omitdeprtags); props.setTreatDeprecatedTagsAsContent(this.treatdeprtagsascontent); props.setOmitComments(this.omitcomments); props.setOmitXmlDeclaration(this.omitxmldecl); props.setOmitDoctypeDeclaration(this.omitdoctypedecl); props.setOmitHtmlEnvelope(this.omithtmlenvelope); props.setUseEmptyElementTags(this.useemptyelementtags); props.setAllowMultiWordAttributes(this.allowmultiwordattributes); props.setAllowHtmlInsideAttributes(this.allowhtmlinsideattributes); props.setIgnoreQuestAndExclam(this.ignoreqe); props.setNamespacesAware(this.namespacesaware); 
props.setHyphenReplacementInComment(this.hyphenreplacement); props.setPruneTags(this.prunetags); props.setBooleanAttributeValues(this.booleanatts); // set cleaner transformation if specified in "transform" attribute // format of attribute is expected to be [=]|[=... // (separator is pipe character) if ( !Utils.isEmptyString(transform) ) { String[] transItems = Utils.tokenize(transform, "|"); Map transInfos = new TreeMap(); for (int i = 0; i < transItems.length; i++) { String item = transItems[i]; int index = item.indexOf('='); String key = index <= 0 ? item : item.substring(0, index); String value = index <= 0 ? null : item.substring(index + 1); transInfos.put(key, value); } CleanerTransformations transformations = new CleanerTransformations(); Iterator iterator = transInfos.entrySet().iterator(); while (iterator.hasNext()) { Map.Entry entry = (Map.Entry) iterator.next(); String tag = (String) entry.getKey(); String value = (String) entry.getValue(); Utils.updateTagTransformations(transformations, tag, value); } cleaner.setTransformations(transformations); } try { TagNode node; try { if ( src != null && (src.startsWith("http://") || src.startsWith("https://")) ) { node = cleaner.clean(new URL(src), incharset); } else if (src != null) { node = cleaner.clean(new File(src), incharset); } else { node = cleaner.clean(text); } } catch (IOException e) { throw new BuildException(e); } // if user specifies XPath expresssion to choose node for serialization, then // try to evaluate XPath and look for first TagNode instance in the resulting array if ( nodebyxpath != null ) { final Object[] xpathResult = node.evaluateXPath(nodebyxpath); for (int i = 0; i < xpathResult.length; i++) { if ( xpathResult[i] instanceof TagNode ) { node = (TagNode) xpathResult[i]; break; } } } OutputStream out; if ( dest == null || "".equals(dest.trim()) ) { out = System.out; } else { out = new FileOutputStream(dest); } if ( "compact".equals(outputtype) ) { new 
CompactXmlSerializer(props).writeToStream(node, out, outcharset, omitenvelope); } else if ( "browser-compact".equals(outputtype) ) { new BrowserCompactXmlSerializer(props).writeToStream(node, out, outcharset, omitenvelope); } else if ( "pretty".equals(outputtype) ) { new PrettyXmlSerializer(props).writeToStream(node, out, outcharset, omitenvelope); } else if ( "htmlsimple".equals(outputtype) ) { new SimpleHtmlSerializer(props).writeToStream(node, out, outcharset, omitenvelope); } else if ( "htmlcompact".equals(outputtype) ) { new CompactHtmlSerializer(props).writeToStream(node, out, outcharset, omitenvelope); } else if ( "htmlpretty".equals(outputtype) ) { new PrettyHtmlSerializer(props).writeToStream(node, out, outcharset, omitenvelope); } else { new SimpleXmlSerializer(props).writeToStream(node, out, outcharset, omitenvelope); } } catch (IOException e) { throw new BuildException(e); } catch (XPatherException e) { throw new BuildException(e); } } } libhtmlcleaner-java-2.2/src/main/java/org/htmlcleaner/TagTransformation.java0000600000175000017500000001171012147655455027601 0ustar stappersstappers/* Copyright (c) 2006-2007, Vladimir Nikic All rights reserved. Redistribution and use of this software in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * The name of HtmlCleaner may not be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You can contact Vladimir Nikic by sending e-mail to nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the subject line. */ package org.htmlcleaner; import java.util.Map; import java.util.LinkedHashMap; /** * Describes how specified tag is transformed to another one, or is ignored during parsing */ public class TagTransformation { private String sourceTag; private String destTag; private boolean preserveSourceAttributes; private Map attributeTransformations; /** * Creates new tag transformation from source tag to target tag specifying whether * source tag attributes are preserved. * @param sourceTag Name of the tag to be transformed. * @param destTag Name of tag to which source tag is to be transformed. * @param preserveSourceAttributes Tells whether source tag attributes are preserved in transformation. */ public TagTransformation(String sourceTag, String destTag, boolean preserveSourceAttributes) { this.sourceTag = sourceTag.toLowerCase(); if (destTag == null) { this.destTag = null; } else { this.destTag = Utils.isValidXmlIdentifier(destTag) ? 
destTag.toLowerCase() : sourceTag; } this.preserveSourceAttributes = preserveSourceAttributes; } /** * Creates new tag transformation from source tag to target tag preserving * all source tag attributes. * @param sourceTag Name of the tag to be transformed. * @param destTag Name of tag to which source tag is to be transformed. */ public TagTransformation(String sourceTag, String destTag) { this(sourceTag, destTag, true); } /** * Creates new tag transformation in which specified tag will be skipped (ignored) * during parsing process. * @param sourceTag */ public TagTransformation(String sourceTag) { this(sourceTag, null); } /** * Adds new attribute transformation to this tag transformation. It tells how destination * attribute will look like. Small templating mechanism is used to describe attribute value: * all names between ${ and } inside the template are evaluated against source tag attributes. * That way one can make attribute values consist of mix of source tag attributes. * * @param targetAttName Name of the destination attribute * @param transformationDesc Template describing attribute value. */ public void addAttributeTransformation(String targetAttName, String transformationDesc) { if (attributeTransformations == null) { attributeTransformations = new LinkedHashMap(); } attributeTransformations.put(targetAttName.toLowerCase(), transformationDesc); } /** * Adds new attribute transformation in which destination attrbute will not exists * (simply removes it from list of attributes). 
* @param targetAttName */ public void addAttributeTransformation(String targetAttName) { addAttributeTransformation(targetAttName, null); } boolean hasAttributeTransformations() { return attributeTransformations != null; } String getSourceTag() { return sourceTag; } String getDestTag() { return destTag; } boolean isPreserveSourceAttributes() { return preserveSourceAttributes; } Map getAttributeTransformations() { return attributeTransformations; } } libhtmlcleaner-java-2.2/src/main/java/org/htmlcleaner/XPatherException.java0000600000175000017500000000426512147655455027400 0ustar stappersstappers/* Copyright (c) 2006-2007, Vladimir Nikic All rights reserved. Redistribution and use of this software in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * The name of HtmlCleaner may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You can contact Vladimir Nikic by sending e-mail to nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the subject line. */ package org.htmlcleaner; /** *

Exception that could occur during XPather evaluation.

*/
public class XPatherException extends Exception {

    /** Creates an exception with the given detail message and underlying cause. */
    public XPatherException(String message, Throwable cause) {
        super(message, cause);
    }

    /** Creates an exception with the given detail message and no cause. */
    public XPatherException(String message) {
        super(message);
    }

    /** Creates an exception that wraps the given cause. */
    public XPatherException(Throwable cause) {
        super(cause);
    }

    /** Creates an exception carrying the generic XPath evaluation error message. */
    public XPatherException() {
        super("Error in evaluating XPath expression!");
    }

}
libhtmlcleaner-java-2.2/src/main/java/org/htmlcleaner/TagInfo.java0000600000175000017500000003145412147655455025475 0ustar stappersstappers/* Copyright (c) 2006-2007, Vladimir Nikic All rights reserved. Redistribution and use of this software in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * The name of HtmlCleaner may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
You can contact Vladimir Nikic by sending e-mail to nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the subject line. */ package org.htmlcleaner; import java.util.*; /** *

* Class contains information about single HTML tag.
* It also contains rules for tag balancing. For each tag, a list of dependent * tags may be defined. There are several kinds of dependencies used to reorder * tags: *

    *
  • * fatal tags - required outer tag - the tag will be ignored during * parsing (will be skipped) if this fatal tag is missing. For example, most web * browsers ignore elements TD, TR, TBODY if they are not in the context of TABLE tag. *
  • *
  • * required enclosing tags - if there is no such tag, it is implicitly * created. For example, if TD is outside of TR, an open TR is created before it. *
  • *
  • * forbidden tags - tags inside which this tag is not allowed to occur - for example * FORM cannot be inside another FORM and it will be ignored during cleanup. *
  • *
  • * allowed children tags - for example TR allows TD and TH. If there * are some dependent allowed tags defined then the cleaner ignores other tags, treating * them as disallowed, unless they are in some other relationship with this tag. *
  • *
  • * higher level tags - for example for TR higher tags are THEAD, TBODY, TFOOT. *
  • *
  • * tags that must be closed and copied - for example, in * <a href="#"><div>.... tag A must be closed before DIV but * copied again inside DIV. *
  • *
  • * tags that must be closed before closing this tag and copied again after - * for example, in <i><b>at</i> first</b> text * tag B must be closed before closing I, but it must be copied again after resulting * finally in sequence: <i><b>at</b></i><b> first</b> text . *
  • *
*

* *

* Tag TR for instance (table row) may define the following dependencies: *

    *
  • fatal tag is table
  • *
  • required enclosing tag is tbody
  • *
  • allowed children tags are td,th
  • *
  • higher level tags are thead,tfoot
  • *
  • tags that must be closed before are tr,td,th,caption,colgroup
  • *
* meaning the following:
*
    *
  • tr must be in context of table, otherwise it will be ignored,
  • *
  • tr can be directly inside tbody, tfoot and thead, * otherwise tbody will be implicitly created in front of it.
  • *
  • tr can contain td and th, all other tags and content will be pushed out of current * limiting context, in the case of html tables, in front of enclosing table tag.
  • *
  • if previous open tag is one of tr, caption or colgroup, it will be implicitly closed.
  • *
*

*/ public class TagInfo { protected static final int HEAD_AND_BODY = 0; protected static final int HEAD = 1; protected static final int BODY = 2; protected static final int CONTENT_ALL = 0; protected static final int CONTENT_NONE = 1; protected static final int CONTENT_TEXT = 2; private String name; private int contentType; private Set mustCloseTags = new HashSet(); private Set higherTags = new HashSet(); private Set childTags = new HashSet(); private Set permittedTags = new HashSet(); private Set copyTags = new HashSet(); private Set continueAfterTags = new HashSet(); private int belongsTo = BODY; private String requiredParent = null; private String fatalTag = null; private boolean deprecated = false; private boolean unique = false; private boolean ignorePermitted = false; public TagInfo(String name, int contentType, int belongsTo, boolean depricated, boolean unique, boolean ignorePermitted) { this.name = name; this.contentType = contentType; this.belongsTo = belongsTo; this.deprecated = depricated; this.unique = unique; this.ignorePermitted = ignorePermitted; } public void defineFatalTags(String commaSeparatedListOfTags) { StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ","); while (tokenizer.hasMoreTokens()) { String currTag = tokenizer.nextToken(); this.fatalTag = currTag; this.higherTags.add(currTag); } } public void defineRequiredEnclosingTags(String commaSeparatedListOfTags) { StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ","); while (tokenizer.hasMoreTokens()) { String currTag = tokenizer.nextToken(); this.requiredParent = currTag; this.higherTags.add(currTag); } } public void defineForbiddenTags(String commaSeparatedListOfTags) { StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ","); while (tokenizer.hasMoreTokens()) { String currTag = tokenizer.nextToken(); this.permittedTags.add(currTag); } } public void 
defineAllowedChildrenTags(String commaSeparatedListOfTags) { StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ","); while (tokenizer.hasMoreTokens()) { String currTag = tokenizer.nextToken(); this.childTags.add(currTag); } } public void defineHigherLevelTags(String commaSeparatedListOfTags) { StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ","); while (tokenizer.hasMoreTokens()) { String currTag = tokenizer.nextToken(); this.higherTags.add(currTag); } } public void defineCloseBeforeCopyInsideTags(String commaSeparatedListOfTags) { StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ","); while (tokenizer.hasMoreTokens()) { String currTag = tokenizer.nextToken(); this.copyTags.add(currTag); this.mustCloseTags.add(currTag); } } public void defineCloseInsideCopyAfterTags(String commaSeparatedListOfTags) { StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ","); while (tokenizer.hasMoreTokens()) { String currTag = tokenizer.nextToken(); this.continueAfterTags.add(currTag); } } public void defineCloseBeforeTags(String commaSeparatedListOfTags) { StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ","); while (tokenizer.hasMoreTokens()) { String currTag = tokenizer.nextToken(); this.mustCloseTags.add(currTag); } } // getters and setters public String getName() { return name; } public void setName(String name) { this.name = name; } public int getContentType() { return contentType; } public Set getMustCloseTags() { return mustCloseTags; } public void setMustCloseTags(Set mustCloseTags) { this.mustCloseTags = mustCloseTags; } public Set getHigherTags() { return higherTags; } public void setHigherTags(Set higherTags) { this.higherTags = higherTags; } public Set getChildTags() { return childTags; } public void setChildTags(Set childTags) { this.childTags = childTags; } public Set 
getPermittedTags() { return permittedTags; } public void setPermittedTags(Set permittedTags) { this.permittedTags = permittedTags; } public Set getCopyTags() { return copyTags; } public void setCopyTags(Set copyTags) { this.copyTags = copyTags; } public Set getContinueAfterTags() { return continueAfterTags; } public void setContinueAfterTags(Set continueAfterTags) { this.continueAfterTags = continueAfterTags; } public String getRequiredParent() { return requiredParent; } public void setRequiredParent(String requiredParent) { this.requiredParent = requiredParent; } public int getBelongsTo() { return belongsTo; } public void setBelongsTo(int belongsTo) { this.belongsTo = belongsTo; } public String getFatalTag() { return fatalTag; } public void setFatalTag(String fatalTag) { this.fatalTag = fatalTag; } public boolean isDeprecated() { return deprecated; } public void setDeprecated(boolean deprecated) { this.deprecated = deprecated; } public boolean isUnique() { return unique; } public void setUnique(boolean unique) { this.unique = unique; } public boolean isIgnorePermitted() { return ignorePermitted; } public boolean isEmptyTag() { return CONTENT_NONE == contentType; } public void setIgnorePermitted(boolean ignorePermitted) { this.ignorePermitted = ignorePermitted; } // other functionality boolean allowsBody() { return CONTENT_NONE != contentType; } boolean isHigher(String tagName) { return higherTags.contains(tagName); } boolean isCopy(String tagName) { return copyTags.contains(tagName); } boolean hasCopyTags() { return !copyTags.isEmpty(); } boolean isContinueAfter(String tagName) { return continueAfterTags.contains(tagName); } boolean hasPermittedTags() { return !permittedTags.isEmpty(); } boolean isHeadTag() { return belongsTo == HEAD; } boolean isHeadAndBodyTag() { return belongsTo == HEAD || belongsTo == HEAD_AND_BODY; } boolean isMustCloseTag(TagInfo tagInfo) { if (tagInfo != null) { return mustCloseTags.contains( tagInfo.getName() ) || tagInfo.contentType == 
CONTENT_TEXT; } return false; } boolean allowsItem(BaseToken token) { if ( contentType != CONTENT_NONE && token instanceof TagToken ) { TagToken tagToken = (TagToken) token; String tagName = tagToken.getName(); if ( "script".equals(tagName) ) { return true; } } if (CONTENT_ALL == contentType) { if ( !childTags.isEmpty() ) { return token instanceof TagToken ? childTags.contains( ((TagToken)token).getName() ) : false; } else if ( !permittedTags.isEmpty() ) { return token instanceof TagToken ? !permittedTags.contains( ((TagToken)token).getName() ) : true; } return true; } else if ( CONTENT_TEXT == contentType ) { return !(token instanceof TagToken); } return false; } boolean allowsAnything() { return CONTENT_ALL == contentType && childTags.size() == 0; } } libhtmlcleaner-java-2.2/src/main/java/org/htmlcleaner/BaseToken.java0000600000175000017500000000375312147655455026022 0ustar stappersstappers/* Copyright (c) 2006-2007, Vladimir Nikic All rights reserved. Redistribution and use of this software in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * The name of HtmlCleaner may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You can contact Vladimir Nikic by sending e-mail to nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the subject line. */ package org.htmlcleaner; import java.io.IOException; import java.io.Writer; /** *

* Base token interface. Tokens are individual entities recognized by HTML parser. *

*/
public interface BaseToken {

    /**
     * Serializes this token to the given writer.
     *
     * @param serializer serializer on whose behalf the token is written
     * @param writer destination of the output
     * @throws IOException if writing fails
     */
    void serialize(Serializer serializer, Writer writer) throws IOException;

}
libhtmlcleaner-java-2.2/src/main/java/org/htmlcleaner/ContentNode.java0000600000175000017500000000463612147655455026370 0ustar stappersstappers/* Copyright (c) 2006-2007, Vladimir Nikic All rights reserved. Redistribution and use of this software in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * The name of HtmlCleaner may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You can contact Vladimir Nikic by sending e-mail to nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the subject line. */ package org.htmlcleaner; import java.io.IOException; import java.io.Writer; /** *

HTML text token.

*/
public class ContentNode implements BaseToken, HtmlNode {

    /** Mutable buffer holding the text of this node. */
    private StringBuilder content;

    /**
     * Creates a text node holding a copy of the given string.
     *
     * @param content initial text of the node
     */
    public ContentNode(String content) {
        this.content = new StringBuilder(content);
    }

    /**
     * Creates a text node from the first {@code len} characters of an array.
     *
     * @param content source characters
     * @param len     number of leading characters to copy
     */
    ContentNode(char[] content, int len) {
        // Reserve 16 characters beyond the initial text.
        StringBuilder buffer = new StringBuilder(len + 16);
        buffer.append(content, 0, len);
        this.content = buffer;
    }

    /**
     * @return the current text of this node
     */
    @Override
    public String toString() {
        return content.toString();
    }

    /**
     * @return the internal buffer itself (not a copy)
     */
    public StringBuilder getContent() {
        return content;
    }

    /**
     * Writes the text of this node to the writer as is; the serializer is not consulted.
     *
     * @param serializer not used by this implementation
     * @param writer     destination of the text
     * @throws IOException if the writer fails
     */
    public void serialize(Serializer serializer, Writer writer) throws IOException {
        String text = content.toString();
        writer.write(text);
    }

}
libhtmlcleaner-java-2.2/src/main/java/org/htmlcleaner/Utils.java0000600000175000017500000004210012147655455025234 0ustar stappersstappers/* Copyright (c) 2006-2007, Vladimir Nikic All rights reserved. Redistribution and use of this software in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * The name of HtmlCleaner may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You can contact Vladimir Nikic by sending e-mail to nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the subject line. */ package org.htmlcleaner; import java.io.*; import java.net.URL; import java.nio.charset.Charset; import java.util.HashMap; import java.util.Map; import java.util.StringTokenizer; import java.util.regex.Matcher; import java.util.regex.Pattern; /** *

Common utilities.

*/ public class Utils { public static String VAR_START = "${"; public static String VAR_END = "}"; public static final Map RESERVED_XML_CHARS = new HashMap(); static { RESERVED_XML_CHARS.put('&', "&"); RESERVED_XML_CHARS.put('<', "<"); RESERVED_XML_CHARS.put('>', ">"); RESERVED_XML_CHARS.put('\"', """); RESERVED_XML_CHARS.put('\'', "'"); } /** * Trims specified string from left. * @param s */ public static String ltrim(String s) { if (s == null) { return null; } int index = 0; int len = s.length(); while ( index < len && Character.isWhitespace(s.charAt(index)) ) { index++; } return (index >= len) ? "" : s.substring(index); } /** * Trims specified string from right. * @param s */ public static String rtrim(String s) { if (s == null) { return null; } int len = s.length(); int index = len; while ( index > 0 && Character.isWhitespace(s.charAt(index-1)) ) { index--; } return (index <= 0) ? "" : s.substring(0, index); } public static String getCharsetFromContentTypeString(String contentType) { if (contentType != null) { String pattern = "charset=([a-z\\d\\-]*)"; Matcher matcher = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE).matcher(contentType); if (matcher.find()) { String charset = matcher.group(1); if (Charset.isSupported(charset)) { return charset; } } } return null; } public static String getCharsetFromContent(URL url) throws IOException { InputStream stream = url.openStream(); byte chunk[] = new byte[2048]; int bytesRead = stream.read(chunk); if (bytesRead > 0) { String startContent = new String(chunk); String pattern = "\\]"; Matcher matcher = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE).matcher(startContent); if (matcher.find()) { String charset = matcher.group(1); if (Charset.isSupported(charset)) { return charset; } } } return null; } public static boolean isHexadecimalDigit(char ch) { return Character.isDigit(ch) || ch == 'A' || ch == 'a' || ch == 'B' || ch == 'b' || ch == 'C' || ch == 'c' || ch == 'D' || ch == 'd' || ch == 'E' || ch == 'e' || ch 
== 'F' || ch == 'f'; } public static boolean isValidXmlChar(char ch) { return ((ch >= 0x20) && (ch <= 0xD7FF)) || (ch == 0x9) || (ch == 0xA) || (ch == 0xD) || ((ch >= 0xE000) && (ch <= 0xFFFD)) || ((ch >= 0x10000) && (ch <= 0x10FFFF)); } public static boolean isReservedXmlChar(char ch) { return RESERVED_XML_CHARS.containsKey(ch); } public static boolean isValidInt(String s, int radix) { try { Integer.parseInt(s, radix); return true; } catch (NumberFormatException e) { return false; } } /** * Escapes XML string. * @param s String to be escaped * @param props Cleaner properties gover affect escaping behaviour * @param isDomCreation Tells if escaped content will be part of the DOM */ public static String escapeXml(String s, CleanerProperties props, boolean isDomCreation) { boolean advanced = props.isAdvancedXmlEscape(); boolean recognizeUnicodeChars = props.isRecognizeUnicodeChars(); boolean translateSpecialEntities = props.isTranslateSpecialEntities(); if (s != null) { int len = s.length(); StringBuilder result = new StringBuilder(len); for (int i = 0; i < len; i++) { char ch = s.charAt(i); if (ch == '&') { if ( (advanced || recognizeUnicodeChars) && (i < len-2) && (s.charAt(i+1) == '#') ) { boolean isHex = Character.toLowerCase(s.charAt(i+2)) == 'x'; int charIndex = i + (isHex ? 3 : 2); int radix = isHex ? 16 : 10; String unicode = ""; while (charIndex < len) { char currCh = s.charAt(charIndex); if (currCh == ';') { break; } else if (isValidInt(unicode + currCh, radix)) { unicode += currCh; charIndex++; } else { charIndex--; break; } } if (isValidInt(unicode, radix)) { char unicodeChar = (char)Integer.parseInt(unicode, radix); if ( !isValidXmlChar(unicodeChar) ) { i = charIndex; } else if ( !isReservedXmlChar(unicodeChar) ) { result.append( recognizeUnicodeChars ? 
String.valueOf(unicodeChar) : "&#" + unicode + ";" ); i = charIndex; } else { i = charIndex; result.append("&#" + unicode + ";"); } } else { result.append("&"); } } else { if (translateSpecialEntities) { // get minimal following sequence required to recognize some special entitiy String seq = s.substring(i, i + Math.min(SpecialEntity.getMaxEntityLength() + 2, len - i)); int semiIndex = seq.indexOf(';'); if (semiIndex > 0) { String entityKey = seq.substring(1, semiIndex); SpecialEntity entity = SpecialEntity.getEntity(entityKey); if (entity != null) { result.append(props.isTransSpecialEntitiesToNCR() ? entity.getDecimalNCR() : entity.getCharacter()); i += entityKey.length() + 1; continue; } } } if (advanced) { String sub = s.substring(i); boolean isReservedSeq = false; for (Map.Entry entry: RESERVED_XML_CHARS.entrySet()) { String seq = entry.getValue(); if ( sub.startsWith(seq) ) { result.append( isDomCreation ? entry.getKey() : (props.transResCharsToNCR ? "&#" + (int)entry.getKey() + ";" : seq) ); i += seq.length() - 1; isReservedSeq = true; break; } } if (!isReservedSeq) { result.append( isDomCreation ? "&" : (props.transResCharsToNCR ? "&#" + (int)'&' + ";" : RESERVED_XML_CHARS.get('&')) ); } continue; } result.append("&"); } } else if (isReservedXmlChar(ch)) { result.append( props.transResCharsToNCR ? "&#" + (int)ch + ";" : (isDomCreation ? ch : RESERVED_XML_CHARS.get(ch)) ); } else { result.append(ch); } } return result.toString(); } return null; } /** * Checks whether specified object's string representation is empty string (containing of only whitespaces). 
* @param object Object whose string representation is checked * @return true, if empty string, false otherwise */ public static boolean isWhitespaceString(Object object) { if (object != null) { String s = object.toString(); return s != null && "".equals(s.trim()); } return false; } /** * Checks if specified character can be part of xml identifier (tag name of attribute name) * and is not standard identifier character. * @param ch Character to be checked * @return True if it can be part of xml identifier */ public static boolean isIdentifierHelperChar(char ch) { return ':' == ch || '.' == ch || '-' == ch || '_' == ch; } /** * Chacks whether specified string can be valid tag name or attribute name in xml. * @param s String to be checked * @return True if string is valid xml identifier, false otherwise */ public static boolean isValidXmlIdentifier(String s) { if (s != null) { int len = s.length(); if (len == 0) { return false; } for (int i = 0; i < len; i++) { char ch = s.charAt(i); if ( (i == 0 && !Character.isUnicodeIdentifierStart(ch) && ch != '_') || (!Character.isUnicodeIdentifierStart(ch) && !Character.isDigit(ch) && !Utils.isIdentifierHelperChar(ch)) ) { return false; } } return true; } return false; } /** * @param o * @return True if specified string is null of contains only whitespace characters */ public static boolean isEmptyString(Object o) { return o == null || "".equals(o.toString().trim()); } /** * Evaluates string template for specified map of variables. Template string can contain * dynamic parts in the form of ${VARNAME}. Each such part is replaced with value of the * variable if such exists in the map, or with empty string otherwise. 
* * @param template Template string * @param variables Map of variables (can be null) * @return Evaluated string */ public static String evaluateTemplate(String template, Map variables) { if (template == null) { return template; } StringBuilder result = new StringBuilder(); int startIndex = template.indexOf(VAR_START); int endIndex = -1; while (startIndex >= 0 && startIndex < template.length()) { result.append( template.substring(endIndex + 1, startIndex) ); endIndex = template.indexOf(VAR_END, startIndex); if (endIndex > startIndex) { String varName = template.substring(startIndex + VAR_START.length(), endIndex); Object resultObj = variables != null ? variables.get(varName.toLowerCase()) : ""; result.append( resultObj == null ? "" : resultObj.toString() ); } startIndex = template.indexOf( VAR_START, Math.max(endIndex + VAR_END.length(), startIndex + 1) ); } result.append( template.substring(endIndex + 1) ); return result.toString(); } public static String[] tokenize(String s, String delimiters) { if (s == null) { return new String[] {}; } StringTokenizer tokenizer = new StringTokenizer(s, delimiters); String result[] = new String[tokenizer.countTokens()]; int index = 0; while (tokenizer.hasMoreTokens()) { result[index++] = tokenizer.nextToken(); } return result; } public static void updateTagTransformations(CleanerTransformations transformations, String key, String value) { int index = key.indexOf('.'); // new tag transformation case (tagname[=destname[,preserveatts]]) if (index <= 0) { String destTag = null; boolean preserveSourceAtts = true; if (value != null) { String[] tokens = tokenize(value, ",;"); if (tokens.length > 0) { destTag = tokens[0]; } if (tokens.length > 1) { preserveSourceAtts = "true".equalsIgnoreCase(tokens[1]) || "yes".equalsIgnoreCase(tokens[1]) || "1".equals(tokens[1]); } } TagTransformation newTagTrans = new TagTransformation(key, destTag, preserveSourceAtts); transformations.addTransformation(newTagTrans); } else { // attribute 
transformation description String[] parts = tokenize(key, "."); String tagName = parts[0]; TagTransformation trans = transformations.getTransformation(tagName); if (trans != null) { trans.addAttributeTransformation(parts[1], value); } } } /** * Checks if specified link is full URL. * * @param link * @return True, if full URl, false otherwise. */ public static boolean isFullUrl(String link) { if (link == null) { return false; } link = link.trim().toLowerCase(); return link.startsWith("http://") || link.startsWith("https://") || link.startsWith("file://"); } /** * Calculates full URL for specified page URL and link * which could be full, absolute or relative like there can * be found in A or IMG tags. */ public static String fullUrl(String pageUrl, String link) { if (isFullUrl(link)) { return link; } else if (link != null && link.startsWith("?")) { int qindex = pageUrl.indexOf('?'); int len = pageUrl.length(); if (qindex < 0) { return pageUrl + link; } else if (qindex == len - 1) { return pageUrl.substring(0, len - 1) + link; } else { return pageUrl + "&" + link.substring(1); } } boolean isLinkAbsolute = link.startsWith("/"); if (!isFullUrl(pageUrl)) { pageUrl = "http://" + pageUrl; } int slashIndex = isLinkAbsolute ? pageUrl.indexOf("/", 8) : pageUrl.lastIndexOf("/"); if (slashIndex <= 8) { pageUrl += "/"; } else { pageUrl = pageUrl.substring(0, slashIndex + 1); } return isLinkAbsolute ? 
pageUrl + link.substring(1) : pageUrl + link; } /** * @param name * @return For xml element name or attribute name returns prefix (part before :) or null if there is no prefix */ public static String getXmlNSPrefix(String name) { int colIndex = name.indexOf(':'); if (colIndex > 0) { return name.substring(0, colIndex); } return null; } /** * @param name * @return For xml element name or attribute name returns name after prefix (part after :) */ public static String getXmlName(String name) { int colIndex = name.indexOf(':'); if (colIndex > 0 && colIndex < name.length() - 1) { return name.substring(colIndex + 1); } return name; } } libhtmlcleaner-java-2.2/src/main/java/org/htmlcleaner/HtmlCleaner.java0000600000175000017500000010105012147655455026332 0ustar stappersstappers/* Copyright (c) 2006-2007, Vladimir Nikic All rights reserved. Redistribution and use of this software in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * The name of HtmlCleaner may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You can contact Vladimir Nikic by sending e-mail to nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the subject line. */ package org.htmlcleaner; import java.io.*; import java.net.URL; import java.net.URLConnection; import java.util.*; /** * Main HtmlCleaner class. * *

It represents the public interface to the user. Its task is to call the tokenizer with * the specified source HTML, traverse the list of produced tokens and create an internal * object model. It also offers a set of methods to write the resulting XML to a string, * file or any output stream.

*

Typical usage is the following:

* * * // create an instance of HtmlCleaner * HtmlCleaner cleaner = new HtmlCleaner(); * * // take default cleaner properties * CleanerProperties props = cleaner.getProperties(); * * // customize cleaner's behaviour with property setters * props.setXXX(...); * * // Clean HTML taken from simple string, file, URL, input stream, * // input source or reader. Result is root node of created * // tree-like structure. Single cleaner instance may be safely used * // multiple times. * TagNode node = cleaner.clean(...); * * // optionally find parts of the DOM or modify some nodes * TagNode[] myNodes = node.getElementsByXXX(...); * // and/or * Object[] myNodes = node.evaluateXPath(xPathExpression); * // and/or * aNode.removeFromTree(); * // and/or * aNode.addAttribute(attName, attValue); * // and/or * aNode.removeAttribute(attName, attValue); * // and/or * cleaner.setInnerHtml(aNode, htmlContent); * // and/or do some other tree manipulation/traversal * * // serialize a node to a file, output stream, DOM, JDom... * new XXXSerializer(props).writeXmlXXX(aNode, ...); * myJDom = new JDomSerializer(props, true).createJDom(aNode); * myDom = new DomSerializer(props, true).createDOM(aNode); * */ public class HtmlCleaner { public static final String DEFAULT_CHARSET = System.getProperty("file.encoding"); /** * Contains information about single open tag */ private class TagPos { private int position; private String name; private TagInfo info; TagPos(int position, String name) { this.position = position; this.name = name; this.info = tagInfoProvider.getTagInfo(name); } } /** * Class that contains information and mathods for managing list of open, * but unhandled tags. 
*/ private class OpenTags { private List list = new ArrayList(); private TagPos last = null; private Set set = new HashSet(); private boolean isEmpty() { return list.isEmpty(); } private void addTag(String tagName, int position) { last = new TagPos(position, tagName); list.add(last); set.add(tagName); } private void removeTag(String tagName) { ListIterator it = list.listIterator( list.size() ); while ( it.hasPrevious() ) { TagPos currTagPos = it.previous(); if (tagName.equals(currTagPos.name)) { it.remove(); break; } } last = list.isEmpty() ? null : list.get( list.size() - 1 ); } private TagPos findFirstTagPos() { return list.isEmpty() ? null : list.get(0); } private TagPos getLastTagPos() { return last; } private TagPos findTag(String tagName) { if (tagName != null) { ListIterator it = list.listIterator(list.size()); String fatalTag = null; TagInfo fatalInfo = tagInfoProvider.getTagInfo(tagName); if (fatalInfo != null) { fatalTag = fatalInfo.getFatalTag(); } while (it.hasPrevious()) { TagPos currTagPos = it.previous(); if (tagName.equals(currTagPos.name)) { return currTagPos; } else if (fatalTag != null && fatalTag.equals(currTagPos.name)) { // do not search past a fatal tag for this tag return null; } } } return null; } private boolean tagExists(String tagName) { TagPos tagPos = findTag(tagName); return tagPos != null; } private TagPos findTagToPlaceRubbish() { TagPos result = null, prev = null; if ( !isEmpty() ) { ListIterator it = list.listIterator( list.size() ); while ( it.hasPrevious() ) { result = it.previous(); if ( result.info == null || result.info.allowsAnything() ) { if (prev != null) { return prev; } } prev = result; } } return result; } private boolean tagEncountered(String tagName) { return set.contains(tagName); } /** * Checks if any of tags specified in the set are already open. 
* @param tags */ private boolean someAlreadyOpen(Set tags) { Iterator it = list.iterator(); while ( it.hasNext() ) { TagPos curr = it.next(); if ( tags.contains(curr.name) ) { return true; } } return false; } } private class CleanTimeValues { private OpenTags _openTags; private boolean _headOpened = false; private boolean _bodyOpened = false; private Set _headTags = new LinkedHashSet(); private Set allTags = new TreeSet(); private TagNode htmlNode; private TagNode bodyNode; private TagNode headNode; private TagNode rootNode; private Set pruneTagSet = new HashSet(); private Set pruneNodeSet = new HashSet(); } private CleanerProperties properties; private ITagInfoProvider tagInfoProvider; private CleanerTransformations transformations = null; /** * Constructor - creates cleaner instance with default tag info provider and default properties. */ public HtmlCleaner() { this(null, null); } /** * Constructor - creates the instance with specified tag info provider and default properties * @param tagInfoProvider Provider for tag filtering and balancing */ public HtmlCleaner(ITagInfoProvider tagInfoProvider) { this(tagInfoProvider, null); } /** * Constructor - creates the instance with default tag info provider and specified properties * @param properties Properties used during parsing and serializing */ public HtmlCleaner(CleanerProperties properties) { this(null, properties); } /** * Constructor - creates the instance with specified tag info provider and specified properties * @param tagInfoProvider Provider for tag filtering and balancing * @param properties Properties used during parsing and serializing */ public HtmlCleaner(ITagInfoProvider tagInfoProvider, CleanerProperties properties) { this.tagInfoProvider = tagInfoProvider == null ? DefaultTagProvider.getInstance() : tagInfoProvider; this.properties = properties == null ? 
new CleanerProperties() : properties; this.properties.tagInfoProvider = this.tagInfoProvider; } public TagNode clean(String htmlContent) { try { return clean( new StringReader(htmlContent) ); } catch (IOException e) { // should never happen because reading from StringReader throw new HtmlCleanerException(e); } } public TagNode clean(File file, String charset) throws IOException { FileInputStream in = new FileInputStream(file); Reader reader = new InputStreamReader(in, charset); return clean(reader); } public TagNode clean(File file) throws IOException { return clean(file, DEFAULT_CHARSET); } public TagNode clean(URL url, String charset) throws IOException { URLConnection urlConnection = url.openConnection(); if (charset == null) { charset = Utils.getCharsetFromContentTypeString( urlConnection.getHeaderField("Content-Type") ); } if (charset == null) { charset = Utils.getCharsetFromContent(url); } if (charset == null) { charset = DEFAULT_CHARSET; } return clean(url.openStream(), charset); } /** * Creates instance from the content downloaded from specified URL. * HTML encoding is resolved following the attempts in the sequence: * 1. reading Content-Type response header, 2. Analyzing META tags at the * beginning of the html, 3. Using platform's default charset. * @param url * @return * @throws IOException */ public TagNode clean(URL url) throws IOException { return clean(url, null); } public TagNode clean(InputStream in, String charset) throws IOException { return clean( new InputStreamReader(in, charset) ); } public TagNode clean(InputStream in) throws IOException { return clean(in, DEFAULT_CHARSET); } public TagNode clean(Reader reader) throws IOException { return clean(reader, new CleanTimeValues()); } /** * Basic version of the cleaning call. * @param reader * @return An instance of TagNode object which is the root of the XML tree. 
* @throws IOException */ public TagNode clean(Reader reader, final CleanTimeValues cleanTimeValues) throws IOException { cleanTimeValues._openTags = new OpenTags(); cleanTimeValues._headOpened = false; cleanTimeValues._bodyOpened = false; cleanTimeValues._headTags.clear(); cleanTimeValues.allTags.clear(); setPruneTags(properties.pruneTags, cleanTimeValues); cleanTimeValues.htmlNode = createTagNode("html", cleanTimeValues); cleanTimeValues.bodyNode = createTagNode("body", cleanTimeValues); cleanTimeValues.headNode = createTagNode("head", cleanTimeValues); cleanTimeValues.rootNode = null; cleanTimeValues.htmlNode.addChild(cleanTimeValues.headNode); cleanTimeValues.htmlNode.addChild(cleanTimeValues.bodyNode); HtmlTokenizer htmlTokenizer = new HtmlTokenizer(reader, properties, transformations, tagInfoProvider) { @Override void makeTree(List tokenList) { HtmlCleaner.this.makeTree( tokenList, tokenList.listIterator(tokenList.size() - 1), cleanTimeValues ); } @Override TagNode createTagNode(String name) { return HtmlCleaner.this.createTagNode(name, cleanTimeValues); } }; htmlTokenizer.start(); List nodeList = htmlTokenizer.getTokenList(); closeAll(nodeList, cleanTimeValues); createDocumentNodes(nodeList, cleanTimeValues); calculateRootNode(cleanTimeValues); // if there are some nodes to prune from tree if ( cleanTimeValues.pruneNodeSet != null && !cleanTimeValues.pruneNodeSet.isEmpty() ) { Iterator iterator = cleanTimeValues.pruneNodeSet.iterator(); while (iterator.hasNext()) { TagNode tagNode = (TagNode) iterator.next(); TagNode parent = tagNode.getParent(); if (parent != null) { parent.removeChild(tagNode); } } } cleanTimeValues.rootNode.setDocType( htmlTokenizer.getDocType() ); return cleanTimeValues.rootNode; } private TagNode createTagNode(String name, CleanTimeValues cleanTimeValues) { TagNode node = new TagNode(name); if ( cleanTimeValues.pruneTagSet != null && name != null && cleanTimeValues.pruneTagSet.contains(name.toLowerCase()) ) { 
cleanTimeValues.pruneNodeSet.add(node); } return node; } private TagNode makeTagNodeCopy(TagNode tagNode, CleanTimeValues cleanTimeValues) { TagNode copy = tagNode.makeCopy(); if ( cleanTimeValues.pruneTagSet != null && cleanTimeValues.pruneTagSet.contains(tagNode.getName()) ) { cleanTimeValues.pruneNodeSet.add(copy); } return copy; } /** * Assigns root node to internal variable. * Root node of the result depends on parameter "omitHtmlEnvelope". * If it is set, then first child of the body will be root node, * or html will be root node otherwise. */ private void calculateRootNode(CleanTimeValues cleanTimeValues) { cleanTimeValues.rootNode = cleanTimeValues.htmlNode; if (properties.omitHtmlEnvelope) { List bodyChildren = cleanTimeValues.bodyNode.getChildren(); if (bodyChildren != null) { for (Object child: bodyChildren) { // if found child that is tag itself, then return it if (child instanceof TagNode) { cleanTimeValues.rootNode = (TagNode)child; break; } } } } } /** * Add attributes from specified map to the specified tag. * If some attribute already exist it is preserved. * @param tag * @param attributes */ private void addAttributesToTag(TagNode tag, Map attributes) { if (attributes != null) { Map tagAttributes = tag.getAttributes(); Iterator it = attributes.entrySet().iterator(); while (it.hasNext()) { Map.Entry currEntry = (Map.Entry) it.next(); String attName = (String) currEntry.getKey(); if ( !tagAttributes.containsKey(attName) ) { String attValue = (String) currEntry.getValue(); tag.setAttribute(attName, attValue); } } } } /** * Checks if open fatal tag is missing if there is a fatal tag for * the specified tag. * @param tag */ private boolean isFatalTagSatisfied(TagInfo tag, CleanTimeValues cleanTimeValues) { if (tag != null) { String fatalTagName = tag.getFatalTag(); return fatalTagName == null ? 
true : cleanTimeValues._openTags.tagExists(fatalTagName); } return true; } /** * Check if specified tag requires parent tag, but that parent * tag is missing in the appropriate context. * @param tag */ private boolean mustAddRequiredParent(TagInfo tag, CleanTimeValues cleanTimeValues) { if (tag != null) { String requiredParent = tag.getRequiredParent(); if (requiredParent != null) { String fatalTag = tag.getFatalTag(); int fatalTagPositon = -1; if (fatalTag != null) { TagPos tagPos = cleanTimeValues._openTags.findTag(fatalTag); if (tagPos != null) { fatalTagPositon = tagPos.position; } } // iterates through the list of open tags from the end and check if there is some higher ListIterator it = cleanTimeValues._openTags.list.listIterator( cleanTimeValues._openTags.list.size() ); while ( it.hasPrevious() ) { TagPos currTagPos = it.previous(); if (tag.isHigher(currTagPos.name)) { return currTagPos.position <= fatalTagPositon; } } return true; } } return false; } private TagNode createTagNode(TagNode startTagToken) { startTagToken.setFormed(); return startTagToken; } private boolean isAllowedInLastOpenTag(BaseToken token, CleanTimeValues cleanTimeValues) { TagPos last = cleanTimeValues._openTags.getLastTagPos(); if (last != null) { if (last.info != null) { return last.info.allowsItem(token); } } return true; } private void saveToLastOpenTag(List nodeList, BaseToken tokenToAdd, CleanTimeValues cleanTimeValues) { TagPos last = cleanTimeValues._openTags.getLastTagPos(); if ( last != null && last.info != null && last.info.isIgnorePermitted() ) { return; } TagPos rubbishPos = cleanTimeValues._openTags.findTagToPlaceRubbish(); if (rubbishPos != null) { TagNode startTagToken = (TagNode) nodeList.get(rubbishPos.position); startTagToken.addItemForMoving(tokenToAdd); } } private boolean isStartToken(Object o) { return (o instanceof TagNode) && !((TagNode)o).isFormed(); } void makeTree(List nodeList, ListIterator nodeIterator, CleanTimeValues cleanTimeValues) { // process while 
not reach the end of the list while ( nodeIterator.hasNext() ) { BaseToken token = nodeIterator.next(); if (token instanceof EndTagToken) { EndTagToken endTagToken = (EndTagToken) token; String tagName = endTagToken.getName(); TagInfo tag = tagInfoProvider.getTagInfo(tagName); if ( (tag == null && properties.omitUnknownTags) || (tag != null && tag.isDeprecated() && properties.omitDeprecatedTags) ) { nodeIterator.set(null); } else if ( tag != null && !tag.allowsBody() ) { nodeIterator.set(null); } else { TagPos matchingPosition = cleanTimeValues._openTags.findTag(tagName); if (matchingPosition != null) { List closed = closeSnippet(nodeList, matchingPosition, endTagToken, cleanTimeValues); nodeIterator.set(null); for (int i = closed.size() - 1; i >= 1; i--) { TagNode closedTag = (TagNode) closed.get(i); if ( tag != null && tag.isContinueAfter(closedTag.getName()) ) { nodeIterator.add( makeTagNodeCopy(closedTag, cleanTimeValues) ); nodeIterator.previous(); } } } else if ( !isAllowedInLastOpenTag(token, cleanTimeValues) ) { saveToLastOpenTag(nodeList, token, cleanTimeValues); nodeIterator.set(null); } } } else if ( isStartToken(token) ) { TagNode startTagToken = (TagNode) token; String tagName = startTagToken.getName(); TagInfo tag = tagInfoProvider.getTagInfo(tagName); TagPos lastTagPos = cleanTimeValues._openTags.isEmpty() ? null : cleanTimeValues._openTags.getLastTagPos(); TagInfo lastTagInfo = lastTagPos == null ? 
null : tagInfoProvider.getTagInfo(lastTagPos.name); // add tag to set of all tags cleanTimeValues.allTags.add(tagName); // HTML open tag if ( "html".equals(tagName) ) { addAttributesToTag(cleanTimeValues.htmlNode, startTagToken.getAttributes()); nodeIterator.set(null); // BODY open tag } else if ( "body".equals(tagName) ) { cleanTimeValues._bodyOpened = true; addAttributesToTag(cleanTimeValues.bodyNode, startTagToken.getAttributes()); nodeIterator.set(null); // HEAD open tag } else if ( "head".equals(tagName) ) { cleanTimeValues._headOpened = true; addAttributesToTag(cleanTimeValues.headNode, startTagToken.getAttributes()); nodeIterator.set(null); // unknown HTML tag and unknown tags are not allowed } else if ( (tag == null && properties.omitUnknownTags) || (tag != null && tag.isDeprecated() && properties.omitDeprecatedTags) ) { nodeIterator.set(null); // if current tag is unknown, unknown tags are allowed and last open tag doesn't allow any other tags in its body } else if ( tag == null && lastTagInfo != null && !lastTagInfo.allowsAnything() ) { saveToLastOpenTag(nodeList, token, cleanTimeValues); nodeIterator.set(null); } else if ( tag != null && tag.hasPermittedTags() && cleanTimeValues._openTags.someAlreadyOpen(tag.getPermittedTags()) ) { nodeIterator.set(null); // if tag that must be unique, ignore this occurence } else if ( tag != null && tag.isUnique() && cleanTimeValues._openTags.tagEncountered(tagName) ) { nodeIterator.set(null); // if there is no required outer tag without that this open tag is ignored } else if ( !isFatalTagSatisfied(tag, cleanTimeValues) ) { nodeIterator.set(null); // if there is no required parent tag - it must be added before this open tag } else if ( mustAddRequiredParent(tag, cleanTimeValues) ) { String requiredParent = tag.getRequiredParent(); TagNode requiredParentStartToken = createTagNode(requiredParent, cleanTimeValues); nodeIterator.previous(); nodeIterator.add(requiredParentStartToken); nodeIterator.previous(); // if last 
open tag has lower presidence then this, it must be closed } else if ( tag != null && lastTagPos != null && tag.isMustCloseTag(lastTagInfo) ) { List closed = closeSnippet(nodeList, lastTagPos, startTagToken, cleanTimeValues); int closedCount = closed.size(); // it is needed to copy some tags again in front of current, if there are any if ( tag.hasCopyTags() && closedCount > 0 ) { // first iterates over list from the back and collects all start tokens // in sequence that must be copied ListIterator closedIt = closed.listIterator(closedCount); List toBeCopied = new ArrayList(); while (closedIt.hasPrevious()) { TagNode currStartToken = (TagNode) closedIt.previous(); if ( tag.isCopy(currStartToken.getName()) ) { toBeCopied.add(0, currStartToken); } else { break; } } if (toBeCopied.size() > 0) { Iterator copyIt = toBeCopied.iterator(); while (copyIt.hasNext()) { TagNode currStartToken = (TagNode) copyIt.next(); nodeIterator.add( makeTagNodeCopy(currStartToken, cleanTimeValues) ); } // back to the previous place, before adding new start tokens for (int i = 0; i < toBeCopied.size(); i++) { nodeIterator.previous(); } } } nodeIterator.previous(); // if this open tag is not allowed inside last open tag, then it must be moved to the place where it can be } else if ( !isAllowedInLastOpenTag(token, cleanTimeValues) ) { saveToLastOpenTag(nodeList, token, cleanTimeValues); nodeIterator.set(null); // if it is known HTML tag but doesn't allow body, it is immediately closed } else if ( tag != null && !tag.allowsBody() ) { TagNode newTagNode = createTagNode(startTagToken); addPossibleHeadCandidate(tag, newTagNode, cleanTimeValues); nodeIterator.set(newTagNode); // default case - just remember this open tag and go further } else { cleanTimeValues._openTags.addTag( tagName, nodeIterator.previousIndex() ); } } else { if ( !isAllowedInLastOpenTag(token, cleanTimeValues) ) { saveToLastOpenTag(nodeList, token, cleanTimeValues); nodeIterator.set(null); } } } } private void 
createDocumentNodes(List listNodes, CleanTimeValues cleanTimeValues) { Iterator it = listNodes.iterator(); while (it.hasNext()) { Object child = it.next(); if (child == null) { continue; } boolean toAdd = true; if (child instanceof TagNode) { TagNode node = (TagNode) child; TagInfo tag = tagInfoProvider.getTagInfo( node.getName() ); addPossibleHeadCandidate(tag, node, cleanTimeValues); } else { if (child instanceof ContentNode) { toAdd = !"".equals(child.toString()); } } if (toAdd) { cleanTimeValues.bodyNode.addChild(child); } } // move all viable head candidates to head section of the tree Iterator headIterator = cleanTimeValues._headTags.iterator(); while (headIterator.hasNext()) { TagNode headCandidateNode = (TagNode) headIterator.next(); // check if this node is already inside a candidate for moving to head TagNode parent = headCandidateNode.getParent(); boolean toMove = true; while (parent != null) { if ( cleanTimeValues._headTags.contains(parent) ) { toMove = false; break; } parent = parent.getParent(); } if (toMove) { headCandidateNode.removeFromTree(); cleanTimeValues.headNode.addChild(headCandidateNode); } } } private List closeSnippet(List nodeList, TagPos tagPos, Object toNode, CleanTimeValues cleanTimeValues) { List closed = new ArrayList(); ListIterator it = nodeList.listIterator(tagPos.position); TagNode tagNode = null; Object item = it.next(); boolean isListEnd = false; while ( (toNode == null && !isListEnd) || (toNode != null && item != toNode) ) { if ( isStartToken(item) ) { TagNode startTagToken = (TagNode) item; closed.add(startTagToken); List itemsToMove = startTagToken.getItemsToMove(); if (itemsToMove != null) { OpenTags prevOpenTags = cleanTimeValues._openTags; cleanTimeValues._openTags = new OpenTags(); makeTree(itemsToMove, itemsToMove.listIterator(0), cleanTimeValues); closeAll(itemsToMove, cleanTimeValues); startTagToken.setItemsToMove(null); cleanTimeValues._openTags = prevOpenTags; } TagNode newTagNode = createTagNode(startTagToken); 
TagInfo tag = tagInfoProvider.getTagInfo( newTagNode.getName() ); addPossibleHeadCandidate(tag, newTagNode, cleanTimeValues); if (tagNode != null) { tagNode.addChildren(itemsToMove); tagNode.addChild(newTagNode); it.set(null); } else { if (itemsToMove != null) { itemsToMove.add(newTagNode); it.set(itemsToMove); } else { it.set(newTagNode); } } cleanTimeValues._openTags.removeTag( newTagNode.getName() ); tagNode = newTagNode; } else { if (tagNode != null) { it.set(null); if (item != null) { tagNode.addChild(item); } } } if ( it.hasNext() ) { item = it.next(); } else { isListEnd = true; } } return closed; } /** * Close all unclosed tags if there are any. */ private void closeAll(List nodeList, CleanTimeValues cleanTimeValues) { TagPos firstTagPos = cleanTimeValues._openTags.findFirstTagPos(); if (firstTagPos != null) { closeSnippet(nodeList, firstTagPos, null, cleanTimeValues); } } /** * Checks if specified tag with specified info is candidate for moving to head section. * @param tagInfo * @param tagNode */ private void addPossibleHeadCandidate(TagInfo tagInfo, TagNode tagNode, CleanTimeValues cleanTimeValues) { if (tagInfo != null && tagNode != null) { if ( tagInfo.isHeadTag() || (tagInfo.isHeadAndBodyTag() && cleanTimeValues._headOpened && !cleanTimeValues._bodyOpened) ) { cleanTimeValues._headTags.add(tagNode); } } } public CleanerProperties getProperties() { return properties; } private void setPruneTags(String pruneTags, CleanTimeValues cleanTimeValues) { cleanTimeValues.pruneTagSet.clear(); cleanTimeValues.pruneNodeSet.clear(); if (pruneTags != null) { StringTokenizer tokenizer = new StringTokenizer(pruneTags, ","); while ( tokenizer.hasMoreTokens() ) { cleanTimeValues.pruneTagSet.add( tokenizer.nextToken().trim().toLowerCase() ); } } } /** * @return ITagInfoProvider instance for this HtmlCleaner */ public ITagInfoProvider getTagInfoProvider() { return tagInfoProvider; } /** * @return Transormations defined for this instance of cleaner */ public 
CleanerTransformations getTransformations() { return transformations; } /** * Sets tranformations for this cleaner instance. * @param transformations */ public void setTransformations(CleanerTransformations transformations) { this.transformations = transformations; } /** * For the specified node, returns it's content as string. * @param node */ public String getInnerHtml(TagNode node) { if (node != null) { try { String content = new SimpleXmlSerializer(properties).getAsString(node); int index1 = content.indexOf("<" + node.getName()); index1 = content.indexOf('>', index1 + 1); int index2 = content.lastIndexOf('<'); return index1 >= 0 && index1 <= index2 ? content.substring(index1 + 1, index2) : null; } catch (IOException e) { throw new HtmlCleanerException(e); } } else { throw new HtmlCleanerException("Cannot return inner html of the null node!"); } } /** * For the specified tag node, defines it's html content. This causes cleaner to * reclean given html portion and insert it inside the node instead of previous content. * @param node * @param content */ public void setInnerHtml(TagNode node, String content) { if (node != null) { String nodeName = node.getName(); StringBuilder html = new StringBuilder(); html.append("<" + nodeName + " marker=''>"); html.append(content); html.append(""); TagNode parent = node.getParent(); while (parent != null) { String parentName = parent.getName(); html.insert(0, "<" + parentName + ">"); html.append(""); parent = parent.getParent(); } TagNode rootNode = clean( html.toString() ); TagNode cleanedNode = rootNode.findElementHavingAttribute("marker", true); if (cleanedNode != null) { node.setChildren( cleanedNode.getChildren() ); } } } } libhtmlcleaner-java-2.2/src/main/java/org/htmlcleaner/JDomSerializer.java0000600000175000017500000001340112147655455027021 0ustar stappersstapperspackage org.htmlcleaner; import org.jdom.*; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; /** *

<p>JDom serializer - creates an XML JDom document instance out of the TagNode.</p>

*/ public class JDomSerializer { private DefaultJDOMFactory factory; protected CleanerProperties props; protected boolean escapeXml = true; public JDomSerializer(CleanerProperties props, boolean escapeXml) { this.props = props; this.escapeXml = escapeXml; } public JDomSerializer(CleanerProperties props) { this(props, true); } public Document createJDom(TagNode rootNode) { this.factory = new DefaultJDOMFactory(); Element rootElement = createElement(rootNode); Document document = this.factory.document(rootElement); setAttributes(rootNode, rootElement); createSubnodes(rootElement, rootNode.getChildren()); return document; } private Element createElement(TagNode node) { String name = node.getName(); boolean nsAware = props.isNamespacesAware(); String prefix = Utils.getXmlNSPrefix(name); Map nsDeclarations = node.getNamespaceDeclarations(); String nsURI = null; if (prefix != null) { name = Utils.getXmlName(name); if (nsAware) { if (nsDeclarations != null) { nsURI = nsDeclarations.get(prefix); } if (nsURI == null) { nsURI = node.getNamespaceURIOnPath(prefix); } if (nsURI == null) { nsURI = prefix; } } } else { if (nsAware) { if (nsDeclarations != null) { nsURI = nsDeclarations.get(""); } if (nsURI == null) { nsURI = node.getNamespaceURIOnPath(prefix); } } } Element element; if (nsAware && nsURI != null) { Namespace ns = prefix == null ? Namespace.getNamespace(nsURI) : Namespace.getNamespace(prefix, nsURI); element = factory.element(name, ns); } else { element = factory.element(name); } if (nsAware) { defineNamespaceDeclarations(node, element); } return element; } private void defineNamespaceDeclarations(TagNode node, Element element) { Map nsDeclarations = node.getNamespaceDeclarations(); if (nsDeclarations != null) { for (Map.Entry nsEntry: nsDeclarations.entrySet()) { String nsPrefix = nsEntry.getKey(); String nsURI = nsEntry.getValue(); Namespace ns = nsPrefix == null || "".equals(nsPrefix) ? 
Namespace.getNamespace(nsURI) : Namespace.getNamespace(nsPrefix, nsURI); element.addNamespaceDeclaration(ns); } } } private void setAttributes(TagNode node, Element element) { for (Map.Entry entry: node.getAttributes().entrySet()) { String attrName = entry.getKey(); String attrValue = entry.getValue(); if (escapeXml) { attrValue = Utils.escapeXml(attrValue, props, true); } String attPrefix = Utils.getXmlNSPrefix(attrName); Namespace ns = null; if (attPrefix != null) { attrName = Utils.getXmlName(attrName); if (props.isNamespacesAware()) { String nsURI = node.getNamespaceURIOnPath(attPrefix); if (nsURI == null) { nsURI = attPrefix; } ns = Namespace.getNamespace(attPrefix, nsURI); } } if (ns == null) { element.setAttribute(attrName, attrValue); } else { element.setAttribute(attrName, attrValue, ns); } } } private void createSubnodes(Element element, List tagChildren) { if (tagChildren != null) { Iterator it = tagChildren.iterator(); while (it.hasNext()) { Object item = it.next(); if (item instanceof CommentNode) { CommentNode commentNode = (CommentNode) item; Comment comment = factory.comment( commentNode.getContent().toString() ); element.addContent(comment); } else if (item instanceof ContentNode) { String nodeName = element.getName(); String content = item.toString(); boolean specialCase = props.isUseCdataForScriptAndStyle() && ("script".equalsIgnoreCase(nodeName) || "style".equalsIgnoreCase(nodeName)); if (escapeXml && !specialCase) { content = Utils.escapeXml(content, props, true); } Text text = specialCase ? 
factory.cdata(content) : factory.text(content); element.addContent(text); } else if (item instanceof TagNode) { TagNode subTagNode = (TagNode) item; Element subelement = createElement(subTagNode); setAttributes(subTagNode, subelement); // recursively create subnodes createSubnodes(subelement, subTagNode.getChildren()); element.addContent(subelement); } else if (item instanceof List) { List sublist = (List) item; createSubnodes(element, sublist); } } } } } libhtmlcleaner-java-2.2/src/main/java/org/htmlcleaner/CommandLine.java0000600000175000017500000003361112147655455026331 0ustar stappersstappers/* Copyright (c) 2006-2007, Vladimir Nikic All rights reserved. Redistribution and use of this software in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * The name of HtmlCleaner may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You can contact Vladimir Nikic by sending e-mail to nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the subject line. */ package org.htmlcleaner; import java.io.File; import java.io.IOException; import java.io.OutputStream; import java.io.FileOutputStream; import java.net.URL; import java.util.Map; import java.util.TreeMap; import java.util.Iterator; /** *

<p>Command line usage class: entry point for running HtmlCleaner from the command line.</p>

*/ public class CommandLine { private static String getArgValue(String[] args, String name) { for (int i = 0; i < args.length; i++) { String curr = args[i]; int eqIndex = curr.indexOf('='); if (eqIndex >= 0) { String argName = curr.substring(0, eqIndex).trim(); String argValue = curr.substring(eqIndex+1).trim(); if (argName.toLowerCase().startsWith(name.toLowerCase())) { return argValue; } } } return ""; } private static boolean toBoolean(String s) { return s != null && ( "on".equalsIgnoreCase(s) || "true".equalsIgnoreCase(s) || "yes".equalsIgnoreCase(s) ); } public static void main(String[] args) throws IOException, XPatherException { String source = getArgValue(args, "src"); if ( "".equals(source) ) { System.err.println("Usage: java -jar htmlcleanerXX.jar src = [incharset = ] " + "[dest = ] [outcharset = ] [taginfofile=] [options...]"); System.err.println(""); System.err.println("where options include:"); System.err.println(" outputtype=simple* | compact | browser-compact | pretty | htmlsimple | htmlcompact | htmlpretty"); System.err.println(" advancedxmlescape=true* | false"); System.err.println(" transrescharstoncr=true | false*"); System.err.println(" usecdata=true* | false"); System.err.println(" specialentities=true* | false"); System.err.println(" transspecialentitiestoncr=true | false*"); System.err.println(" unicodechars=true* | false"); System.err.println(" omitunknowntags=true | false*"); System.err.println(" treatunknowntagsascontent=true | false*"); System.err.println(" omitdeprtags=true | false*"); System.err.println(" treatdeprtagsascontent=true | false*"); System.err.println(" omitcomments=true | false*"); System.err.println(" omitxmldecl=true | false*"); System.err.println(" omitdoctypedecl=true* | false"); System.err.println(" useemptyelementtags=true* | false"); System.err.println(" allowmultiwordattributes=true* | false"); System.err.println(" allowhtmlinsideattributes=true | false*"); System.err.println(" ignoreqe=true* | false"); 
System.err.println(" namespacesaware=true* | false"); System.err.println(" hyphenreplacement= [=]"); System.err.println(" prunetags= []"); System.err.println(" booleanatts=self* | empty | true"); System.err.println(" nodebyxpath="); System.err.println(" omitenvelope=true | false*"); System.err.println(" t:[=[,]]"); System.err.println(" t:.[=