sanitize-7.0.0/ 0000755 0000041 0000041 00000000000 14744072357 013411 5 ustar www-data www-data sanitize-7.0.0/lib/ 0000755 0000041 0000041 00000000000 14744072357 014157 5 ustar www-data www-data sanitize-7.0.0/lib/sanitize.rb 0000644 0000041 0000041 00000017635 14744072357 016346 0 ustar www-data www-data # frozen_string_literal: true
require "nokogiri"
require "set"
require_relative "sanitize/version"
require_relative "sanitize/config"
require_relative "sanitize/config/default"
require_relative "sanitize/config/restricted"
require_relative "sanitize/config/basic"
require_relative "sanitize/config/relaxed"
require_relative "sanitize/css"
require_relative "sanitize/transformers/clean_cdata"
require_relative "sanitize/transformers/clean_comment"
require_relative "sanitize/transformers/clean_css"
require_relative "sanitize/transformers/clean_doctype"
require_relative "sanitize/transformers/clean_element"
class Sanitize
attr_reader :config
# Matches one or more control characters that should be removed from HTML
# before parsing, as defined by the HTML living standard.
#
# - https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
# - https://infra.spec.whatwg.org/#control
REGEX_HTML_CONTROL_CHARACTERS = /[\u0001-\u0008\u000b\u000e-\u001f\u007f-\u009f]+/u
# Matches one or more non-characters that should be removed from HTML before
# parsing, as defined by the HTML living standard.
#
# - https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
# - https://infra.spec.whatwg.org/#noncharacter
REGEX_HTML_NON_CHARACTERS = /[\ufdd0-\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}]+/u
# Matches an attribute value that could be treated by a browser as a URL with
# a protocol prefix, such as "http:" or "javascript:". Any string of zero or
# more characters followed by a colon is considered a match, even if the colon
# is encoded as an entity and even if it's an incomplete entity (which IE6 and
# Opera will still parse).
#
# The colon may be literal, a decimal character reference (`&#58`) or a
# hexadecimal one (`&#x3a`); leading zeros are accepted and no trailing
# semicolon is required. Capture group 1 holds the protocol name.
REGEX_PROTOCOL = /\A\s*([^\/#]*?)(?::|&#0*58|&#x0*3a)/i
# Matches one or more characters that should be stripped from HTML before
# parsing. This is a combination of `REGEX_HTML_CONTROL_CHARACTERS` and
# `REGEX_HTML_NON_CHARACTERS`.
#
# https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
REGEX_UNSUITABLE_CHARS = /(?:#{REGEX_HTML_CONTROL_CHARACTERS}|#{REGEX_HTML_NON_CHARACTERS})/u
#--
# Class Methods
#++
# Returns a sanitized copy of the given full _html_ document, using the
# settings in _config_ if specified.
#
# When sanitizing a document, the `<html>` element must be allowlisted or an
# error will be raised. If this is undesirable, you should probably use
# {.fragment} instead.
#
# Convenience wrapper: builds a new Sanitize instance from _config_ and
# delegates to its `document` method.
def self.document(html, config = {})
Sanitize.new(config).document(html)
end
# Returns a sanitized copy of the given _html_ fragment, using the settings in
# _config_ if specified.
#
# Convenience wrapper: builds a new Sanitize instance from _config_ and
# delegates to its `fragment` method.
def self.fragment(html, config = {})
Sanitize.new(config).fragment(html)
end
# Sanitizes the given `Nokogiri::XML::Node` instance and all its children.
#
# Convenience wrapper: builds a new Sanitize instance from _config_ and
# delegates to its `node!` method, which modifies _node_ in place.
def self.node!(node, config = {})
Sanitize.new(config).node!(node)
end
# Legacy class-method names kept for pre-3.0.0 backwards compatibility. Each
# one is an alias of the current class method of the same purpose.
class << self
  # @deprecated Use {.document} instead.
  alias clean_document document

  # @deprecated Use {.fragment} instead.
  alias clean fragment

  # @deprecated Use {.node!} instead.
  alias clean_node! node!
end
#--
# Instance Methods
#++
# Returns a new Sanitize object initialized with the settings in _config_.
#
# _config_ is deep-merged over `Config::DEFAULT`; the merged result is what
# `#config` exposes. The transformer chain is then assembled in this order:
#
# 1. any custom transformers from `config[:transformers]`
# 2. `CleanElement`
# 3. `CleanComment` (unless comments are allowed)
# 4. the CSS element/attribute cleaners (only when `style` elements or
#    `style` attributes are allowlisted)
# 5. `CleanDoctype` and `CleanCDATA`
def initialize(config = {})
  @config = Config.merge(Config::DEFAULT, config)
  @transformers = Array(@config[:transformers]).dup

  # Default transformers always run at the end of the chain, after any custom
  # transformers.
  @transformers << Transformers::CleanElement.new(@config)
  @transformers << Transformers::CleanComment unless @config[:allow_comments]

  # Build the CSS sanitizer from the *merged* config. The merged config always
  # has a `:css` section, so Sanitize::CSS never falls back to treating the
  # whole HTML config (with its own `:protocols`, `:allow_comments`, ...) as
  # CSS settings when the caller's config has no `:css` key.
  if @config[:elements].include?("style")
    scss = Sanitize::CSS.new(@config)
    @transformers << Transformers::CSS::CleanElement.new(scss)
  end

  if @config[:attributes].values.any? { |attr| attr.include?("style") }
    scss ||= Sanitize::CSS.new(@config)
    @transformers << Transformers::CSS::CleanAttribute.new(scss)
  end

  @transformers << Transformers::CleanDoctype
  @transformers << Transformers::CleanCDATA

  # Reused (and mutated) for every transformer call; see #transform_node!.
  @transformer_config = {config: @config}
end
# Parses _html_ as a complete HTML document, sanitizes the resulting tree and
# returns it serialized as a String. A `nil` or `false` input yields "".
#
# When sanitizing a document, the `<html>` element must be allowlisted or an
# error will be raised. If this is undesirable, you should probably use
# {#fragment} instead.
def document(html)
  return "" unless html

  cleaned_input = preprocess(html)
  parsed = Nokogiri::HTML5.parse(cleaned_input, **@config[:parser_options])
  node!(parsed)
  to_html(parsed)
end
# @deprecated Use {#document} instead.
alias_method :clean_document, :document
# Parses _html_ as an HTML fragment, sanitizes the resulting tree and returns
# it serialized as a String. A `nil` or `false` input yields "".
def fragment(html)
  return "" unless html

  cleaned_input = preprocess(html)
  parsed = Nokogiri::HTML5.fragment(cleaned_input, **@config[:parser_options])
  node!(parsed)
  to_html(parsed)
end
# @deprecated Use {#fragment} instead.
alias_method :clean, :fragment
# Sanitizes the given `Nokogiri::XML::Node` and all its children, modifying it
# in place.
#
# If _node_ is a `Nokogiri::XML::Document`, the `<html>` element must be
# allowlisted or an error will be raised.
#
# Returns the same _node_ that was passed in.
#
# Raises ArgumentError if _node_ is not a `Nokogiri::XML::Node`, and
# Sanitize::Error if a document is given but `html` is not in
# `config[:elements]`.
def node!(node)
  unless node.is_a?(Nokogiri::XML::Node)
    raise ArgumentError, "node must be a Nokogiri::XML::Node"
  end

  if node.is_a?(Nokogiri::XML::Document) && !@config[:elements].include?("html")
    raise Error, 'When sanitizing a document, "<html>" must be allowlisted.'
  end

  # Shared across the whole traversal so a transformer can allowlist nodes
  # that are visited later.
  node_allowlist = Set.new

  traverse(node) do |n|
    transform_node!(n, node_allowlist)
  end

  node
end
# @deprecated Use {#node!} instead.
alias_method :clean_node!, :node!
private
# Preprocesses HTML before parsing to remove undesirable Unicode chars.
#
# Works on a copy of `html.to_s`, so the caller's object is never mutated.
# The copy is normalized to valid UTF-8: non-UTF-8 input is transcoded, and
# UTF-8 input is scrubbed, with invalid or undefined sequences replaced in
# both cases. Without the scrub, a UTF-8 string containing invalid bytes would
# make the `gsub!` below raise ArgumentError. Finally every character matched
# by REGEX_UNSUITABLE_CHARS is removed.
#
# Returns the cleaned UTF-8 String.
def preprocess(html)
  html = html.to_s.dup

  if html.encoding == Encoding::UTF_8
    html.scrub!
  else
    html.encode!("UTF-8",
      invalid: :replace,
      undef: :replace)
  end

  html.gsub!(REGEX_UNSUITABLE_CHARS, "")
  html
end
# Serializes _node_ by calling its `to_html` with `preserve_newline: true`.
def to_html(node)
node.to_html(preserve_newline: true)
end
# Runs every transformer in the chain against _node_ and returns _node_.
#
# Each transformer is called with keyword arguments describing the node (both
# the current `allowlist` names and the legacy `whitelist` names are passed).
# If a transformer returns a Hash containing an enumerable `:node_allowlist`
# (or `:node_whitelist`), its members are added to _node_allowlist_, so later
# transformers and later nodes see them as allowlisted.
def transform_node!(node, node_allowlist)
  # This method may run in a tight loop over thousands of nodes, so a single
  # environment hash is reused and assigned into directly rather than being
  # rebuilt with merge for every call:
  # https://github.com/JuanitoFatas/fast-ruby#hashmerge-vs-hashmerge-code
  env = @transformer_config

  @transformers.each do |transformer|
    allowlisted = node_allowlist.include?(node)
    env[:is_whitelisted] = allowlisted
    env[:is_allowlisted] = allowlisted
    env[:node] = node
    env[:node_name] = node.name.downcase
    env[:node_whitelist] = node_allowlist
    env[:node_allowlist] = node_allowlist

    outcome = transformer.call(**env)
    next unless outcome.is_a?(Hash)

    additions = outcome[:node_allowlist] || outcome[:node_whitelist]
    node_allowlist.merge(additions) if additions.respond_to?(:each)
  end

  node
end
# Performs top-down traversal of the given node, operating first on the node
# itself, then traversing each child (if any) in order.
#
# The block may unlink or move the node it is given; the loop below
# re-derives the next child after every step so traversal continues from the
# right place in that case.
def traverse(node, &block)
yield node
child = node.child
while child
# Remember the left neighbour *before* the block runs, since the block may
# detach `child` from the tree.
prev = child.previous_sibling
traverse(child, &block)
# If `child` is still attached to `node`, simply advance to its sibling.
child = if child.parent == node
child.next_sibling
else
# The child was unlinked or reparented, so traverse the previous node's
# next sibling, or the parent's first child if there is no previous
# node.
prev ? prev.next_sibling : node.child
end
end
end
# Error raised by Sanitize itself; `#node!` raises it when asked to sanitize a
# document without the `html` element being allowlisted.
class Error < StandardError; end
end
sanitize-7.0.0/lib/sanitize/ 0000755 0000041 0000041 00000000000 14744072357 016005 5 ustar www-data www-data sanitize-7.0.0/lib/sanitize/config.rb 0000644 0000041 0000041 00000003334 14744072357 017602 0 ustar www-data www-data # frozen_string_literal: true
require "set"
class Sanitize
  module Config
    # Recursively freezes _config_ and everything reachable from it through
    # Hash values and Array/Set members, then returns _config_ itself.
    def self.freeze_config(config)
      case config
      when Hash
        config.each_value { |value| freeze_config(value) }
      when Array, Set
        config.each { |member| freeze_config(member) }
      end

      config.freeze
    end

    # Builds and returns a brand-new Hash that is *config* with *other_config*
    # deeply merged on top. Neither argument is modified.
    #
    # For a key present in *other_config*:
    # - if both values are Hashes they are merged recursively (an empty base
    #   Hash is simply replaced by a copy of the override);
    # - an Array override becomes a Set, except under `:transformers`;
    # - anything else is copied with `dup` when that is safe.
    #
    # Keys only present in *config* are copied over the same way.
    #
    # This is the safest way to use a built-in Sanitize config as the basis for
    # your own custom config.
    def self.merge(config, other_config = {})
      raise ArgumentError, "config must be a Hash" unless Hash === config
      raise ArgumentError, "other_config must be a Hash" unless Hash === other_config

      (config.keys | other_config.keys).each_with_object({}) do |key, result|
        base = config[key]

        unless other_config.has_key?(key)
          result[key] = can_dupe?(base) ? base.dup : base
          next
        end

        override = other_config[key]
        result[key] =
          if Hash === base && Hash === override
            base.empty? ? override.dup : merge(base, override)
          elsif Array === override && key != :transformers
            Set.new(override)
          elsif can_dupe?(override)
            override.dup
          else
            override
          end
      end
    end

    # Tells whether `dup` may be safely called on _value_: `false` for
    # booleans, nil, Methods, Numerics and Symbols, `true` for anything else.
    def self.can_dupe?(value)
      return false if value == true || value == false || value.nil?

      [Method, Numeric, Symbol].none? { |klass| klass === value }
    end
    private_class_method :can_dupe?
  end
end
sanitize-7.0.0/lib/sanitize/version.rb 0000644 0000041 0000041 00000000106 14744072357 020014 0 ustar www-data www-data # frozen_string_literal: true
class Sanitize
# Version string of this library.
VERSION = "7.0.0"
end
sanitize-7.0.0/lib/sanitize/css.rb 0000644 0000041 0000041 00000024741 14744072357 017132 0 ustar www-data www-data # frozen_string_literal: true
require "crass"
require "set"
class Sanitize
class CSS
attr_reader :config
# -- Class Methods ---------------------------------------------------------
# Sanitizes inline CSS style properties.
#
# This is most useful for sanitizing non-stylesheet fragments of CSS like
# you would find in the `style` attribute of an HTML element. To sanitize a
# full CSS stylesheet, use {.stylesheet}.
#
# Convenience wrapper: builds a new Sanitize::CSS from _config_ and delegates
# to its `properties` method.
#
# @example
# Sanitize::CSS.properties("background: url(foo.png); color: #fff;")
#
# @param css [String] CSS property declarations to sanitize.
# @param config [Hash] Either a CSS config or a full Sanitize config with a
#   `:css` key.
# @return [String] Sanitized CSS properties.
def self.properties(css, config = {})
new(config).properties(css)
end
# Sanitizes a full CSS stylesheet.
#
# A stylesheet may include selectors, at-rules, and comments. To sanitize
# only inline style properties such as the contents of an HTML `style`
# attribute, use {.properties}.
#
# Convenience wrapper: builds a new Sanitize::CSS from _config_ and delegates
# to its `stylesheet` method.
#
# @example
# css = %[
# .foo {
# background: url(foo.png);
# color: #fff;
# }
#
# #bar {
# font: 42pt 'Comic Sans MS';
# }
# ]
#
# Sanitize::CSS.stylesheet(css, Sanitize::Config::RELAXED)
#
# @param css [String] Stylesheet source to sanitize.
# @param config [Hash] Either a CSS config or a full Sanitize config with a
#   `:css` key.
# @return [String] Sanitized CSS stylesheet.
def self.stylesheet(css, config = {})
new(config).stylesheet(css)
end
# Sanitizes the given Crass CSS parse tree and all its children, modifying
# it in place.
#
# Convenience wrapper: builds a new Sanitize::CSS from _config_ and delegates
# to its `tree!` method.
#
# @example
# css = %[
# .foo {
# background: url(foo.png);
# color: #fff;
# }
#
# #bar {
# font: 42pt 'Comic Sans MS';
# }
# ]
#
# tree = Crass.parse(css)
# Sanitize::CSS.tree!(tree, Sanitize::Config::RELAXED)
#
# @param tree [Array] Crass parse tree; it is mutated.
# @param config [Hash] Either a CSS config or a full Sanitize config with a
#   `:css` key.
# @return [Array] Sanitized Crass CSS parse tree.
def self.tree!(tree, config = {})
new(config).tree!(tree)
end
# -- Instance Methods ------------------------------------------------------
# Builds a Sanitize::CSS object from _config_.
#
# _config_ may be a full Sanitize config (its `:css` section is used) or a
# bare CSS config. Either way the settings are merged over the default CSS
# config, and the three at-rule lists are cached as Sets for fast lookup.
def initialize(config = {})
  css_settings = config[:css] || config
  @config = Config.merge(Config::DEFAULT[:css], css_settings)

  @at_rules, @at_rules_with_properties, @at_rules_with_styles =
    %i[at_rules at_rules_with_properties at_rules_with_styles].map do |key|
      Set.new(@config[key])
    end

  @import_url_validator = @config[:import_url_validator]
end
# Sanitizes inline CSS style properties.
#
# Intended for non-stylesheet fragments of CSS such as the value of an HTML
# `style` attribute. For a complete stylesheet use {#stylesheet}.
#
# The input is parsed with Crass (honouring the `allow_comments` and
# `allow_hacks` settings), filtered with {#tree!}, and turned back into text.
#
# @example
# scss = Sanitize::CSS.new(Sanitize::Config::RELAXED)
# scss.properties("background: url(foo.png); color: #fff;")
#
# @return [String] Sanitized CSS properties.
def properties(css)
  parse_options = {
    preserve_comments: @config[:allow_comments],
    preserve_hacks: @config[:allow_hacks]
  }

  parsed = Crass.parse_properties(css, **parse_options)
  tree!(parsed)
  Crass::Parser.stringify(parsed)
end
# Sanitizes a full CSS stylesheet.
#
# A stylesheet may contain selectors, at-rules, and comments. To sanitize
# only inline declarations such as the contents of an HTML `style`
# attribute, use {#properties}.
#
# The input is parsed with Crass (honouring the `allow_comments` and
# `allow_hacks` settings), filtered with {#tree!}, and turned back into text.
#
# @example
# css = %[
# .foo {
# background: url(foo.png);
# color: #fff;
# }
#
# #bar {
# font: 42pt 'Comic Sans MS';
# }
# ]
#
# scss = Sanitize::CSS.new(Sanitize::Config::RELAXED)
# scss.stylesheet(css)
#
# @return [String] Sanitized CSS stylesheet.
def stylesheet(css)
  parse_options = {
    preserve_comments: @config[:allow_comments],
    preserve_hacks: @config[:allow_hacks]
  }

  parsed = Crass.parse(css, **parse_options)
  tree!(parsed)
  Crass::Parser.stringify(parsed)
end
# Filters a Crass CSS parse tree in place and returns it.
#
# Every entry of _tree_ is replaced either by itself (possibly sanitized) or
# by `nil` when it is not allowed:
#
# - at-rules and properties are delegated to `at_rule!` / `property!`
# - comments survive only when `allow_comments` is set
# - a semicolon survives only directly after a kept property
# - style rules are kept and their children filtered recursively
# - whitespace is always kept; anything else is dropped
#
# @example
# css = %[
# .foo {
# background: url(foo.png);
# color: #fff;
# }
#
# #bar {
# font: 42pt 'Comic Sans MS';
# }
# ]
#
# scss = Sanitize::CSS.new(Sanitize::Config::RELAXED)
# tree = Crass.parse(css)
#
# scss.tree!(tree)
#
# @return [Array] Sanitized Crass CSS parse tree.
def tree!(tree)
  after_kept_property = false

  tree.map! do |node|
    next if node.nil?

    kind = node[:node]

    if kind == :at_rule
      after_kept_property = false
      at_rule!(node)
    elsif kind == :comment
      node if @config[:allow_comments]
    elsif kind == :property
      kept = property!(node)
      after_kept_property = !kept.nil?
      kept
    elsif kind == :semicolon
      # A semicolon is only meaningful right after an allowlisted property;
      # dropping the others avoids runs of redundant semicolons.
      if after_kept_property
        after_kept_property = false
        node
      end
    elsif kind == :style_rule
      after_kept_property = false
      tree!(node[:children])
      node
    elsif kind == :whitespace
      node
    end
  end

  tree
end
# -- Protected Instance Methods --------------------------------------------
protected
# Sanitizes a CSS at-rule node and returns it, or returns `nil` when the
# current config does not allow the rule.
#
# - Rules listed in `at_rules_with_styles` have their block re-parsed as
#   rules and filtered.
# - Rules listed in `at_rules_with_properties` have their block re-parsed as
#   properties and filtered.
# - Rules listed in `at_rules` are kept only if they have no block, and for
#   `@import` only if the import URL is allowed.
def at_rule!(rule)
  rule_name = rule[:name].downcase

  if @at_rules_with_styles.include?(rule_name)
    nested_rules = Crass::Parser.parse_rules(rule[:block],
      preserve_comments: @config[:allow_comments],
      preserve_hacks: @config[:allow_hacks])
    rule[:block] = tree!(nested_rules)
    return rule
  end

  if @at_rules_with_properties.include?(rule_name)
    declarations = Crass::Parser.parse_properties(rule[:block],
      preserve_comments: @config[:allow_comments],
      preserve_hacks: @config[:allow_hacks])
    rule[:block] = tree!(declarations)
    return rule
  end

  return nil unless @at_rules.include?(rule_name)
  return nil if rule_name == "import" && !import_url_allowed?(rule)

  rule.has_key?(:block) ? nil : rule
end
# Tells whether _name_ is one of the image-related CSS functions (`image`,
# `image-set`, `-webkit-image-set`) whose arguments may contain image URLs
# that need to be validated.
def image_function?(name)
  case name
  when "image", "image-set", "-webkit-image-set" then true
  else false
  end
end
# Decides whether an `@import` rule may be kept.
#
# With no `import_url_validator` configured every import is allowed.
# Otherwise the value of the rule's first `:url` or `:string` token is handed
# to the validator and its result is returned; a rule without such a token
# (or with an empty value) is rejected.
def import_url_allowed?(rule)
  validator = @import_url_validator
  return true unless validator

  token = rule[:tokens].find do |tok|
    kind = tok[:node]
    kind == :url || kind == :string
  end

  # don't allow @imports with no URL value
  import_url = token && token[:value]
  return false unless import_url

  validator.call(import_url)
end
# Sanitizes a CSS property node. Returns the sanitized node, or `nil` if the
# current config doesn't allow this property.
#
# The property is rejected when its name is not in `config[:properties]`, or
# when any token in its value (including tokens nested inside functions) is a
# disallowed URL, a bad URL, a disallowed image function, or an `expression`.
# The node itself is never modified; it is returned as-is when accepted.
def property!(prop)
name = prop[:name].downcase
# Preserve IE * and _ hacks if desired.
# The hack prefix is stripped from the local copy only, so the allowlist
# lookup below sees the plain property name.
if @config[:allow_hacks]
name.slice!(0) if /\A[*_]/.match?(name)
end
return nil unless @config[:properties].include?(name)
# Work on a copy of the child list: nested function arguments get appended to
# it below, and `each` then visits those appended tokens too.
nodes = prop[:children].dup
# Lowercased text of the ident/function tokens seen so far; compared against
# "expression" below.
combined_value = +""
nodes.each do |child|
value = child[:value]
case child[:node]
when :ident
combined_value << value.downcase if String === value
when :function
if child.key?(:name)
# NOTE: this reuses the `name` local; the property name is not needed
# again after the allowlist check above.
name = child[:name].downcase
if name == "url"
return nil unless valid_url?(child)
end
if image_function?(name)
return nil unless valid_image?(child)
end
combined_value << name
return nil if name == "expression" || combined_value == "expression"
end
if Array === value
# Queue the function's argument tokens so they are checked by later
# iterations of this same loop.
nodes.concat(value)
elsif String === value
lowercase_value = value.downcase
combined_value << lowercase_value
return nil if lowercase_value == "expression" || combined_value == "expression"
end
when :url
return nil unless valid_url?(child)
when :bad_url
return nil
end
end
prop
end
# Tells whether _node_ refers to a URL whose protocol is allowlisted in
# `config[:protocols]` (`:relative` stands for URLs without a protocol).
#
# _node_ may be a `:url` node or a `url(...)` `:function` node, since the CSS
# syntax can produce both. Any other node type is rejected, as is a `url()`
# function whose arguments are not exactly one `:string` node plus any number
# of `:whitespace` nodes.
def valid_url?(node)
  url =
    case node[:node]
    when :url
      node[:value]
    when :function
      return false unless node.key?(:name) && node[:name].downcase == "url"

      args = node[:value]
      return false unless Array === args

      strings = []
      args.each do |arg|
        return false unless Hash === arg

        case arg[:node]
        when :string then strings << arg
        when :whitespace then next
        else return false
        end
      end

      # Exactly one string argument is required.
      return false unless strings.size == 1

      strings.first[:value]
    else
      return false
    end

  protocol =
    if url =~ Sanitize::REGEX_PROTOCOL
      Regexp.last_match(1).downcase
    else
      :relative
    end

  @config[:protocols].include?(protocol)
end
# Returns `true` if the given node is an image-related function and contains
# only strings that use an allowlisted protocol, `false` otherwise.
#
# Only the function's direct `:string` arguments are inspected here; every
# other kind of argument token is accepted by this check. A non-Hash token
# makes the whole node invalid.
def valid_image?(node)
  return false unless node[:node] == :function
  return false unless node.key?(:name) && image_function?(node[:name].downcase)
  return false unless Array === node[:value]

  # `all?` yields a real boolean; the previous `each` made this predicate
  # return the token Array on success.
  node[:value].all? do |token|
    next false unless Hash === token
    next true unless token[:node] == :string

    protocol =
      if token[:value] =~ Sanitize::REGEX_PROTOCOL
        Regexp.last_match(1).downcase
      else
        :relative
      end

    @config[:protocols].include?(protocol)
  end
end
end
end
sanitize-7.0.0/lib/sanitize/config/ 0000755 0000041 0000041 00000000000 14744072357 017252 5 ustar www-data www-data sanitize-7.0.0/lib/sanitize/config/default.rb 0000644 0000041 0000041 00000011265 14744072357 021230 0 ustar www-data www-data # frozen_string_literal: true
class Sanitize
module Config
DEFAULT = freeze_config(
  # HTML attributes to add to specific elements. By default, no attributes
  # are added.
  add_attributes: {},

  # Whether or not to allow HTML comments. Allowing comments is strongly
  # discouraged, since IE allows script execution within conditional
  # comments.
  allow_comments: false,

  # Whether or not to allow well-formed HTML doctype declarations such as
  # "<!DOCTYPE html>" when sanitizing a document. This setting is ignored
  # when sanitizing fragments.
  allow_doctype: false,

  # HTML attributes to allow in specific elements. By default, no attributes
  # are allowed. Use the symbol :data to indicate that arbitrary HTML5
  # data-* attributes should be allowed.
  attributes: {},

  # CSS sanitization settings.
  css: {
    # Whether or not to allow CSS comments.
    allow_comments: false,

    # Whether or not to allow browser compatibility hacks such as the IE *
    # and _ hacks. These are generally harmless, but technically result in
    # invalid CSS.
    allow_hacks: false,

    # CSS at-rules to allow that may not have associated blocks (e.g.
    # "import").
    #
    # https://developer.mozilla.org/en-US/docs/Web/CSS/At-rule
    at_rules: [],

    # CSS at-rules to allow whose blocks may contain properties (e.g.
    # "font-face").
    at_rules_with_properties: [],

    # CSS at-rules to allow whose blocks may contain styles (e.g. "media").
    at_rules_with_styles: [],

    # CSS properties to allow.
    properties: [],

    # URL protocols to allow in CSS URLs.
    protocols: []
  },

  # HTML elements to allow. By default, no elements are allowed (which means
  # that all HTML will be stripped).
  #
  # Warning: Sanitize cannot safely sanitize the contents of foreign
  # elements (elements in the MathML or SVG namespaces). Do not add `math`
  # or `svg` to this list! If you do, you may create a security
  # vulnerability in your application.
  elements: [],

  # HTML parsing options, passed as keyword arguments to Nokogiri's HTML5
  # parser (formerly Nokogumbo).
  # https://github.com/rubys/nokogumbo/tree/v2.0.1#parsing-options
  parser_options: {},

  # URL handling protocols to allow in specific attributes. By default, no
  # protocols are allowed. Use :relative in place of a protocol if you want
  # to allow relative URLs sans protocol.
  protocols: {},

  # If this is true, Sanitize will remove the contents of any filtered
  # elements in addition to the elements themselves. By default, Sanitize
  # leaves the safe parts of an element's contents behind when the element
  # is removed.
  #
  # If this is an Array or Set of element names, then only the contents of
  # the specified elements (when filtered) will be removed, and the contents
  # of all other filtered elements will be left behind.
  remove_contents: %w[
    iframe math noembed noframes noscript plaintext script style svg xmp
  ],

  # Transformers allow you to filter or alter nodes using custom logic. See
  # README.md for details and examples.
  transformers: [],

  # Elements which, when removed, should have their contents surrounded by
  # values specified with `before` and `after` keys to preserve readability.
  # For example, `foo<div>bar</div>baz` will become 'foo bar baz' when the
  # <div> is removed.
  whitespace_elements: {
    "address" => {before: " ", after: " "},
    "article" => {before: " ", after: " "},
    "aside" => {before: " ", after: " "},
    "blockquote" => {before: " ", after: " "},
    "br" => {before: " ", after: " "},
    "dd" => {before: " ", after: " "},
    "div" => {before: " ", after: " "},
    "dl" => {before: " ", after: " "},
    "dt" => {before: " ", after: " "},
    "footer" => {before: " ", after: " "},
    "h1" => {before: " ", after: " "},
    "h2" => {before: " ", after: " "},
    "h3" => {before: " ", after: " "},
    "h4" => {before: " ", after: " "},
    "h5" => {before: " ", after: " "},
    "h6" => {before: " ", after: " "},
    "header" => {before: " ", after: " "},
    "hgroup" => {before: " ", after: " "},
    "hr" => {before: " ", after: " "},
    "li" => {before: " ", after: " "},
    "nav" => {before: " ", after: " "},
    "ol" => {before: " ", after: " "},
    "p" => {before: " ", after: " "},
    "pre" => {before: " ", after: " "},
    "section" => {before: " ", after: " "},
    "ul" => {before: " ", after: " "}
  }
)
end
end
sanitize-7.0.0/lib/sanitize/config/relaxed.rb 0000644 0000041 0000041 00000054754 14744072357 021242 0 ustar www-data www-data # frozen_string_literal: true
class Sanitize
module Config
RELAXED = freeze_config(
elements: BASIC[:elements] + %w[
address article aside bdi bdo body caption col colgroup data del div
figcaption figure footer h1 h2 h3 h4 h5 h6 head header hgroup hr html
img ins main nav rp rt ruby section span style summary table tbody
td tfoot th thead title tr wbr
],
allow_doctype: true,
attributes: merge(BASIC[:attributes],
:all => %w[class dir hidden id lang style tabindex title translate],
"a" => %w[href hreflang name rel],
"col" => %w[span width],
"colgroup" => %w[span width],
"data" => %w[value],
"del" => %w[cite datetime],
"img" => %w[align alt border height src srcset width],
"ins" => %w[cite datetime],
"li" => %w[value],
"ol" => %w[reversed start type],
"style" => %w[media scoped type],
"table" => %w[align bgcolor border cellpadding cellspacing frame rules sortable summary width],
"td" => %w[abbr align axis colspan headers rowspan valign width],
"th" => %w[abbr align axis colspan headers rowspan scope sorted valign width],
"ul" => %w[type]),
protocols: merge(BASIC[:protocols],
"del" => {"cite" => ["http", "https", :relative]},
"img" => {"src" => ["http", "https", :relative]},
"ins" => {"cite" => ["http", "https", :relative]}),
css: {
allow_comments: true,
allow_hacks: true,
at_rules_with_properties: %w[
bottom-center
bottom-left
bottom-left-corner
bottom-right
bottom-right-corner
font-face
left-bottom
left-middle
left-top
page
right-bottom
right-middle
right-top
top-center
top-left
top-left-corner
top-right
top-right-corner
],
at_rules_with_styles: %w[
-moz-keyframes
-o-keyframes
-webkit-keyframes
container
document
keyframes
media
supports
],
protocols: ["http", "https", :relative],
properties: %w[
-moz-appearance
-moz-background-inline-policy
-moz-box-sizing
-moz-column-count
-moz-column-fill
-moz-column-gap
-moz-column-rule
-moz-column-rule-color
-moz-column-rule-style
-moz-column-rule-width
-moz-column-width
-moz-font-feature-settings
-moz-font-language-override
-moz-hyphens
-moz-text-align-last
-moz-text-decoration-color
-moz-text-decoration-line
-moz-text-decoration-style
-moz-text-size-adjust
-ms-background-position-x
-ms-background-position-y
-ms-block-progression
-ms-content-zoom-chaining
-ms-content-zoom-limit
-ms-content-zoom-limit-max
-ms-content-zoom-limit-min
-ms-content-zoom-snap
-ms-content-zoom-snap-points
-ms-content-zoom-snap-type
-ms-content-zooming
-ms-filter
-ms-flex
-ms-flex-align
-ms-flex-direction
-ms-flex-order
-ms-flex-pack
-ms-flex-wrap
-ms-flow-from
-ms-flow-into
-ms-grid-column
-ms-grid-column-align
-ms-grid-column-span
-ms-grid-columns
-ms-grid-row
-ms-grid-row-align
-ms-grid-row-span
-ms-grid-rows
-ms-high-contrast-adjust
-ms-hyphenate-limit-chars
-ms-hyphenate-limit-lines
-ms-hyphenate-limit-zone
-ms-hyphens
-ms-ime-mode
-ms-interpolation-mode
-ms-layout-flow
-ms-layout-grid
-ms-layout-grid-char
-ms-layout-grid-line
-ms-layout-grid-mode
-ms-layout-grid-type
-ms-overflow-style
-ms-overflow-x
-ms-overflow-y
-ms-progress-appearance
-ms-scroll-chaining
-ms-scroll-limit
-ms-scroll-limit-x-max
-ms-scroll-limit-x-min
-ms-scroll-limit-y-max
-ms-scroll-limit-y-min
-ms-scroll-rails
-ms-scroll-snap-points-x
-ms-scroll-snap-points-y
-ms-scroll-snap-type
-ms-scroll-snap-x
-ms-scroll-snap-y
-ms-scroll-translation
-ms-scrollbar-arrow-color
-ms-scrollbar-base-color
-ms-scrollbar-darkshadow-color
-ms-scrollbar-face-color
-ms-scrollbar-highlight-color
-ms-scrollbar-shadow-color
-ms-scrollbar-track-color
-ms-text-align-last
-ms-text-autospace
-ms-text-justify
-ms-text-kashida-space
-ms-text-overflow
-ms-text-size-adjust
-ms-text-underline-position
-ms-touch-action
-ms-user-select
-ms-word-break
-ms-word-wrap
-ms-wrap-flow
-ms-wrap-margin
-ms-wrap-through
-ms-writing-mode
-ms-zoom
-webkit-align-content
-webkit-align-items
-webkit-align-self
-webkit-animation
-webkit-animation-delay
-webkit-animation-direction
-webkit-animation-duration
-webkit-animation-fill-mode
-webkit-animation-iteration-count
-webkit-animation-name
-webkit-animation-play-state
-webkit-animation-timing-function
-webkit-appearance
-webkit-backface-visibility
-webkit-background-blend-mode
-webkit-background-clip
-webkit-background-composite
-webkit-background-origin
-webkit-background-size
-webkit-blend-mode
-webkit-border-after
-webkit-border-after-color
-webkit-border-after-style
-webkit-border-after-width
-webkit-border-before
-webkit-border-before-color
-webkit-border-before-style
-webkit-border-before-width
-webkit-border-bottom-left-radius
-webkit-border-bottom-right-radius
-webkit-border-end
-webkit-border-end-color
-webkit-border-end-style
-webkit-border-end-width
-webkit-border-fit
-webkit-border-image
-webkit-border-radius
-webkit-border-start
-webkit-border-start-color
-webkit-border-start-style
-webkit-border-start-width
-webkit-border-top-left-radius
-webkit-border-top-right-radius
-webkit-box-align
-webkit-box-decoration-break
-webkit-box-flex
-webkit-box-flex-group
-webkit-box-lines
-webkit-box-ordinal-group
-webkit-box-orient
-webkit-box-pack
-webkit-box-reflect
-webkit-box-shadow
-webkit-box-sizing
-webkit-clip-path
-webkit-column-axis
-webkit-column-break-after
-webkit-column-break-before
-webkit-column-break-inside
-webkit-column-count
-webkit-column-gap
-webkit-column-progression
-webkit-column-rule
-webkit-column-rule-color
-webkit-column-rule-style
-webkit-column-rule-width
-webkit-column-span
-webkit-column-width
-webkit-columns
-webkit-filter
-webkit-flex
-webkit-flex-basis
-webkit-flex-direction
-webkit-flex-flow
-webkit-flex-grow
-webkit-flex-shrink
-webkit-flex-wrap
-webkit-flow-from
-webkit-flow-into
-webkit-font-size-delta
-webkit-font-smoothing
-webkit-grid-area
-webkit-grid-auto-columns
-webkit-grid-auto-flow
-webkit-grid-auto-rows
-webkit-grid-column
-webkit-grid-column-end
-webkit-grid-column-start
-webkit-grid-definition-columns
-webkit-grid-definition-rows
-webkit-grid-row
-webkit-grid-row-end
-webkit-grid-row-start
-webkit-justify-content
-webkit-line-clamp
-webkit-logical-height
-webkit-logical-width
-webkit-margin-after
-webkit-margin-after-collapse
-webkit-margin-before
-webkit-margin-before-collapse
-webkit-margin-bottom-collapse
-webkit-margin-collapse
-webkit-margin-end
-webkit-margin-start
-webkit-margin-top-collapse
-webkit-marquee
-webkit-marquee-direction
-webkit-marquee-increment
-webkit-marquee-repetition
-webkit-marquee-speed
-webkit-marquee-style
-webkit-mask
-webkit-mask-box-image
-webkit-mask-box-image-outset
-webkit-mask-box-image-repeat
-webkit-mask-box-image-slice
-webkit-mask-box-image-source
-webkit-mask-box-image-width
-webkit-mask-clip
-webkit-mask-composite
-webkit-mask-image
-webkit-mask-origin
-webkit-mask-position
-webkit-mask-position-x
-webkit-mask-position-y
-webkit-mask-repeat
-webkit-mask-repeat-x
-webkit-mask-repeat-y
-webkit-mask-size
-webkit-mask-source-type
-webkit-max-logical-height
-webkit-max-logical-width
-webkit-min-logical-height
-webkit-min-logical-width
-webkit-opacity
-webkit-order
-webkit-padding-after
-webkit-padding-before
-webkit-padding-end
-webkit-padding-start
-webkit-perspective
-webkit-perspective-origin
-webkit-perspective-origin-x
-webkit-perspective-origin-y
-webkit-region-break-after
-webkit-region-break-before
-webkit-region-break-inside
-webkit-region-fragment
-webkit-shape-inside
-webkit-shape-margin
-webkit-shape-outside
-webkit-shape-padding
-webkit-svg-shadow
-webkit-tap-highlight-color
-webkit-text-decoration
-webkit-text-decoration-color
-webkit-text-decoration-line
-webkit-text-decoration-style
-webkit-text-fill-color
-webkit-text-size-adjust
-webkit-touch-callout
-webkit-transform
-webkit-transform-origin
-webkit-transform-origin-x
-webkit-transform-origin-y
-webkit-transform-origin-z
-webkit-transform-style
-webkit-transition
-webkit-transition-delay
-webkit-transition-duration
-webkit-transition-property
-webkit-transition-timing-function
-webkit-user-drag
-webkit-wrap-flow
-webkit-wrap-through
accent-color
align-content
align-items
align-self
alignment-adjust
alignment-baseline
all
anchor-point
anchor-name
anchor-scope
animation
animation-composition
animation-delay
animation-direction
animation-duration
animation-fill-mode
animation-iteration-count
animation-name
animation-play-state
animation-range
animation-range-end
animation-range-start
animation-timeline
animation-timing-function
appearance
aspect-ratio
azimuth
backface-visibility
background
background-attachment
background-blend-mode
background-clip
background-color
background-image
background-origin
background-position
background-repeat
background-size
baseline-shift
binding
bleed
baseline-source
block-ellipsis
block-size
bookmark-label
bookmark-level
bookmark-state
border
border-block
border-block-color
border-block-end
border-block-end-color
border-block-end-style
border-block-end-width
border-block-start
border-block-start-color
border-block-start-style
border-block-start-width
border-block-style
border-block-width
border-bottom
border-bottom-color
border-bottom-left-radius
border-bottom-right-radius
border-bottom-style
border-bottom-width
border-boundary
border-collapse
border-color
border-end-end-radius
border-end-start-radius
border-image
border-image-outset
border-image-repeat
border-image-slice
border-image-source
border-image-width
border-inline
border-inline-color
border-inline-end
border-inline-end-color
border-inline-end-style
border-inline-end-width
border-inline-start
border-inline-start-color
border-inline-start-style
border-inline-start-width
border-inline-style
border-inline-width
border-left
border-left-color
border-left-style
border-left-width
border-radius
border-right
border-right-color
border-right-style
border-right-width
border-spacing
border-start-end-radius
border-start-start-radius
border-style
border-top
border-top-color
border-top-left-radius
border-top-right-radius
border-top-style
border-top-width
border-width
bottom
box-decoration-break
box-shadow
box-sizing
box-snap
box-suppress
break-after
break-before
break-inside
caption-side
chains
caret
caret-color
caret-shape
clear
clip
clip-path
clip-rule
color
color-interpolation
color-adjust
color-interpolation-filters
color-profile
color-rendering
color-scheme
column-count
column-fill
column-gap
column-rule
column-rule-color
column-rule-style
column-rule-width
column-span
column-width
columns
contain
contain-intrinsic-block-size
contain-intrinsic-height
contain-intrinsic-inline-size
contain-intrinsic-size
contain-intrinsic-width
container
container-name
container-type
content
content-visibility
continue
counter-increment
counter-reset
counter-set
crop
cue
cue-after
cue-before
cursor
direction
display
display-inside
display-list
display-outside
dominant-baseline
elevation
empty-cells
enable-background
fill
fill-opacity
fill-rule
filter
flex
flex-basis
flex-direction
flex-flow
flex-grow
flex-shrink
flex-wrap
float
float-offset
flood-color
flood-opacity
flow-from
flow-into
font
font-family
font-feature-settings
font-kerning
font-language-override
font-optical-sizing
font-palette
font-size
font-size-adjust
font-stretch
font-style
font-synthesis
font-synthesis-position
font-synthesis-small-caps
font-synthesis-style
font-synthesis-weight
font-variant
font-variant-alternates
font-variant-caps
font-variant-east-asian
font-variant-emoji
font-variant-ligatures
font-variant-numeric
font-variant-position
font-variation-settings
font-weight
glyph-orientation-horizontal
font-width
footnote-display
footnote-policy
forced-color-adjust
gap
glyph-orientation-vertical
grid
grid-area
grid-auto-columns
grid-auto-flow
grid-auto-rows
grid-column
grid-column-end
grid-column-start
grid-row
grid-row-end
grid-row-start
grid-template
grid-template-areas
grid-template-columns
grid-template-rows
hanging-punctuation
height
hyphens
icon
image-orientation
image-rendering
image-resolution
ime-mode
initial-letters
inline-box-align
justify-content
justify-items
justify-self
kerning
left
letter-spacing
lighting-color
line-box-contain
line-break
line-grid
line-height
line-snap
line-stacking
line-stacking-ruby
line-stacking-shift
line-stacking-strategy
list-style
list-style-image
list-style-position
list-style-type
margin
margin-bottom
margin-left
margin-right
margin-top
marker
marker-end
marker-mid
marker-offset
marker-side
marker-start
marks
mask
mask-box
mask-box-outset
mask-box-repeat
mask-box-slice
mask-box-source
mask-box-width
mask-clip
mask-image
mask-origin
mask-position
mask-repeat
mask-size
mask-source-type
mask-type
max-height
max-lines
max-width
min-height
min-width
move-to
nav-down
nav-index
nav-left
nav-right
nav-up
object-fit
object-position
opacity
order
orphans
outline
outline-color
outline-offset
outline-style
outline-width
overflow
overflow-wrap
overflow-x
overflow-y
padding
padding-bottom
padding-left
padding-right
padding-top
page
page-break-after
page-break-before
page-break-inside
page-policy
pause
pause-after
pause-before
perspective
perspective-origin
pitch
pitch-range
play-during
pointer-events
position
presentation-level
quotes
region-fragment
resize
rest
rest-after
rest-before
richness
right
rotation
rotation-point
ruby-align
ruby-merge
ruby-position
shape-image-threshold
shape-margin
shape-outside
shape-rendering
size
speak
speak-as
speak-header
speak-numeral
speak-punctuation
speech-rate
stop-color
stop-opacity
stress
string-set
stroke
stroke-dasharray
stroke-dashoffset
stroke-linecap
stroke-linejoin
stroke-miterlimit
stroke-opacity
stroke-width
tab-size
table-layout
text-align
text-align-all
text-align-last
text-anchor
text-combine-horizontal
text-autospace
text-box
text-box-edge
text-box-trim
text-combine-upright
text-decoration
text-decoration-color
text-decoration-line
text-decoration-skip
text-decoration-skip-box
text-decoration-skip-ink
text-decoration-skip-inset
text-decoration-skip-self
text-decoration-skip-spaces
text-decoration-style
text-decoration-thickness
text-emphasis
text-emphasis-color
text-emphasis-position
text-emphasis-skip
text-emphasis-style
text-height
text-group-align
text-indent
text-justify
text-orientation
text-overflow
text-rendering
text-shadow
text-size-adjust
text-space-collapse
text-spacing
text-spacing-trim
text-transform
text-underline-offset
text-underline-position
text-wrap
text-wrap-mode
text-wrap-style
timeline-scope
top
touch-action
transform
transform-box
transform-origin
transform-style
transition
transition-delay
transition-duration
transition-property
transition-timing-function
translate
unicode-bidi
unicode-range
user-select
vertical-align
view-timeline
view-timeline-axis
view-timeline-inset
view-timeline-name
view-transition-class
view-transition-group
view-transition-name
visibility
voice-balance
voice-duration
voice-family
voice-pitch
voice-range
voice-rate
voice-stress
voice-volume
volume
white-space
white-space-collapse
white-space-trim
widows
width
will-change
word-break
word-space-transform
word-spacing
word-wrap
wrap-after
wrap-before
wrap-flow
wrap-inside
wrap-through
writing-mode
z-index
]
}
)
end
end
sanitize-7.0.0/lib/sanitize/config/basic.rb 0000644 0000041 0000041 00000001422 14744072357 020657 0 ustar www-data www-data # frozen_string_literal: true
class Sanitize
  module Config
    # Basic configuration. Starts from the element list of RESTRICTED and adds
    # links, lists, quotations and other simple text-level elements.
    #
    # Only the attributes listed below are allowlisted, URL-bearing attributes
    # are limited to the listed protocols (or relative URLs), and
    # rel="nofollow" is configured to be added to every <a> element.
    BASIC = freeze_config(
      elements: [
        *RESTRICTED[:elements],
        *%w[
          a abbr blockquote br cite code dd dfn dl dt kbd li mark ol p pre q s
          samp small strike sub sup time ul var
        ]
      ],

      # Attribute allowlist, keyed by element name.
      attributes: {
        "a" => %w[href],
        "abbr" => %w[title],
        "blockquote" => %w[cite],
        "dfn" => %w[title],
        "q" => %w[cite],
        "time" => %w[datetime pubdate]
      },

      # Attributes that are set on matching elements.
      add_attributes: {
        "a" => {"rel" => "nofollow"}
      },

      # Protocols accepted in URL attributes, keyed by element then attribute.
      protocols: {
        "a" => {"href" => ["ftp", "http", "https", "mailto", :relative]},
        "blockquote" => {"cite" => ["http", "https", :relative]},
        "q" => {"cite" => ["http", "https", :relative]}
      }
    )
  end
end
sanitize-7.0.0/lib/sanitize/config/restricted.rb 0000644 0000041 0000041 00000000222 14744072357 021743 0 ustar www-data www-data # frozen_string_literal: true
class Sanitize
  module Config
    # Restricted configuration. This constant passes only an element list to
    # freeze_config: a handful of inline text-formatting elements.
    RESTRICTED = freeze_config(elements: %w[b em i strong u])
  end
end
sanitize-7.0.0/lib/sanitize/transformers/ 0000755 0000041 0000041 00000000000 14744072357 020532 5 ustar www-data www-data sanitize-7.0.0/lib/sanitize/transformers/clean_comment.rb 0000644 0000041 0000041 00000000404 14744072357 023661 0 ustar www-data www-data # frozen_string_literal: true
class Sanitize
  module Transformers
    # Transformer that unlinks comment nodes from the document unless the
    # node has been allowlisted (env[:is_allowlisted] is truthy). Nodes of any
    # other type are left untouched.
    CleanComment = lambda do |env|
      node = env[:node]
      is_comment = node.type == Nokogiri::XML::Node::COMMENT_NODE

      node.unlink if is_comment && !env[:is_allowlisted]
    end
  end
end
sanitize-7.0.0/lib/sanitize/transformers/clean_element.rb 0000644 0000041 0000041 00000023116 14744072357 023655 0 ustar www-data www-data # frozen_string_literal: true
require "cgi"
require "set"
class Sanitize
module Transformers
class CleanElement
# Pattern for the HTML5 data attribute names that Sanitize accepts.
#
# The unicode ranges are deliberately a conservative subset of what the
# standard technically allows: they cover the most common characters used in
# data attribute names and leave out uncommon or potentially misleading
# characters, as well as characters that could be normalized into unsafe or
# confusing forms.
#
# Names that need other characters (combining marks, full-width characters,
# CJK, ...) should be validated by a custom transformer instead.
#
# https://html.spec.whatwg.org/multipage/dom.html#embedding-custom-non-visible-data-with-the-data-*-attributes
REGEX_DATA_ATTR = /\Adata-(?!xml)[a-z_][\w.\u00E0-\u00F6\u00F8-\u017F\u01DD-\u02AF-]*\z/u

# Names of elements whose content HTML parsers treat as unescaped text.
UNESCAPED_TEXT_ELEMENTS = %w[
  iframe
  noembed
  noframes
  noscript
  plaintext
  script
  style
  xmp
].to_set

# Attributes that, on `<a>` elements only, need additional escaping because
# of unsafe libxml2 behavior.
UNSAFE_LIBXML_ATTRS_A = %w[
  name
].to_set

# Attributes that need additional escaping on every element because of
# unsafe libxml2 behavior.
UNSAFE_LIBXML_ATTRS_GLOBAL = %w[
  action
  href
  src
].to_set

# Replacement table (original character => escape sequence) applied to
# attributes affected by unsafe libxml2 behavior.
UNSAFE_LIBXML_ESCAPE_CHARS = {
  " " => "%20",
  '"' => "%22"
}

# Matches a single character that has an entry in the replacement table
# above.
UNSAFE_LIBXML_ESCAPE_REGEX = /[ "]/
# Builds the transformer state from a Sanitize config hash.
#
# Reads the :add_attributes, :attributes, :elements, :protocols,
# :whitespace_elements and :remove_contents entries of +config+. The
# config's :attributes hash is duplicated before being modified.
def initialize(config)
  @add_attributes = config[:add_attributes]
  @elements = config[:elements]
  @protocols = config[:protocols]

  # Every per-element allowlist becomes a Set that also contains the
  # attributes allowed on all elements. The :all entry itself is left as-is.
  @attributes = config[:attributes].dup
  global_attrs = @attributes[:all] || []

  @attributes.each_key do |element_name|
    next if element_name == :all

    @attributes[element_name] = Set.new(@attributes[element_name]).merge(global_attrs)
  end

  # Backcompat: a Set of whitespace elements is converted to a hash that
  # maps each element to a single space before and after.
  whitespace = config[:whitespace_elements]

  @whitespace_elements =
    if whitespace.is_a?(Set)
      whitespace.each_with_object({}) do |element, mapping|
        mapping[element] = {before: " ", after: " "}
      end
    else
      whitespace
    end

  # :remove_contents is either a collection of element names or a flag
  # covering every element.
  remove_contents = config[:remove_contents]

  if remove_contents.is_a?(Enumerable)
    @remove_all_contents = false
    @remove_element_contents = Set.new(remove_contents.map(&:to_s))
  else
    @remove_all_contents = !!remove_contents
    @remove_element_contents = Set.new
  end
end
def call(env)
node = env[:node]
return if node.type != Nokogiri::XML::Node::ELEMENT_NODE || env[:is_allowlisted]
name = env[:node_name]
# Delete any element that isn't in the config allowlist, unless the node
# has already been deleted from the document.
#
# It's important that we not try to reparent the children of a node that
# has already been deleted, since that seems to trigger a memory leak in
# Nokogiri.
unless @elements.include?(name) || node.parent.nil?
# Elements like br, div, p, etc. need to be replaced with whitespace
# in order to preserve readability.
if @whitespace_elements.include?(name)
node.add_previous_sibling(Nokogiri::XML::Text.new(@whitespace_elements[name][:before].to_s, node.document))
unless node.children.empty?
node.add_next_sibling(Nokogiri::XML::Text.new(@whitespace_elements[name][:after].to_s, node.document))
end
end
unless node.children.empty?
unless @remove_all_contents || @remove_element_contents.include?(name)
node.add_previous_sibling(node.children)
end
end
node.unlink
return
end
attr_allowlist = @attributes[name] || @attributes[:all]
if attr_allowlist.nil?
# Delete all attributes from elements with no allowlisted attributes.
node.attribute_nodes.each { |attr| attr.unlink }
else
allow_data_attributes = attr_allowlist.include?(:data)
# Delete any attribute that isn't allowed on this element.
node.attribute_nodes.each do |attr|
attr_name = attr.name.downcase
unless attr_allowlist.include?(attr_name)
# The attribute isn't in the allowlist, but may still be allowed
# if it's a data attribute.
unless allow_data_attributes && attr_name.start_with?("data-") && attr_name =~ REGEX_DATA_ATTR
# Either the attribute isn't a data attribute or arbitrary data
# attributes aren't allowed. Remove the attribute.
attr.unlink
next
end
end
# The attribute is allowed.
# Remove any attributes that use unacceptable protocols.
if @protocols.include?(name) && @protocols[name].include?(attr_name)
attr_protocols = @protocols[name][attr_name]
if attr.value =~ REGEX_PROTOCOL
unless attr_protocols.include?($1.downcase)
attr.unlink
next
end
else
unless attr_protocols.include?(:relative)
attr.unlink
next
end
end
# Leading and trailing whitespace around URLs is ignored at parse
# time. Stripping it here prevents it from being escaped by the
# libxml2 workaround below.
attr.value = attr.value.strip
end
# libxml2 >= 2.9.2 doesn't escape comments within some attributes,
# in an attempt to preserve server-side includes. This can result in
# XSS since an unescaped double quote can allow an attacker to
# inject a non-allowlisted attribute.
#
# Sanitize works around this by implementing its own escaping for
# affected attributes, some of which can exist on any element and
# some of which can only exist on `<a>` elements.
#
# This fix is technically no longer necessary with Nokogumbo >= 2.0
# since it no longer uses libxml2's serializer, but it's retained to
# avoid breaking use cases where people might be sanitizing
# individual Nokogiri nodes and then serializing them manually
# without Nokogumbo.
#
# The relevant libxml2 code is here:
#
if UNSAFE_LIBXML_ATTRS_GLOBAL.include?(attr_name) ||
(name == "a" && UNSAFE_LIBXML_ATTRS_A.include?(attr_name))
attr.value = attr.value.gsub(UNSAFE_LIBXML_ESCAPE_REGEX, UNSAFE_LIBXML_ESCAPE_CHARS)
end
end
end
# Add required attributes.
if @add_attributes.include?(name)
@add_attributes[name].each { |key, val| node[key] = val }
end
# Element-specific special cases.
case name
# If this is an allowlisted iframe that has children, remove all its
# children. The HTML standard says iframes shouldn't have content, but
# when they do, this content is parsed as text and is serialized
# verbatim without being escaped, which is unsafe because legacy
# browsers may still render it and execute `")).must_equal ""
_(Sanitize.fragment("", allow_comments: false, elements: ["script"]))
.must_equal ""
end
end
describe "when :allow_comments is true" do
before do
@s = Sanitize.new(allow_comments: true, elements: ["div"])
end
it "should allow comments" do
_(@s.fragment("foo bar")).must_equal "foo bar"
_(@s.fragment("foo "
_(@s.fragment("foo "
_(@s.fragment("foo bar")).must_equal "foo bar"
_(@s.fragment("foo --> -->bar")).must_equal "foo --> -->bar"
_(@s.fragment("foo
"
_(Sanitize.fragment("", allow_comments: true, elements: ["script"]))
.must_equal ""
end
end
end
sanitize-7.0.0/test/test_sanitize.rb 0000644 0000041 0000041 00000017273 14744072357 017614 0 ustar www-data www-data # frozen_string_literal: true
require_relative "common"
describe "Sanitize" do
describe "initializer" do
  # The caller's :transformers array must have the same length after
  # Sanitize.new returns as it had before.
  it "should not modify a transformers array in the given config" do
    transformers = [-> {}]

    Sanitize.new({transformers: transformers})

    _(transformers.length).must_equal(1)
  end
end
describe "instance methods" do
# Each example starts with a Sanitize instance built without arguments.
before { @s = Sanitize.new }
# Specs for Sanitize#document, run against an instance that allowlists only
# the "html" element.
#
# NOTE(review): several literals in this group look damaged. For example the
# first input reads 'Loremipsumdolor sit amet ' while the expectation reads
# "Lorem ipsum dolor sit amet ", which suggests HTML markup was stripped from
# the strings in this copy. Confirm against the upstream file before relying
# on these expectations.
describe "#document" do
before do
@s = Sanitize.new(elements: ["html"])
end
it "should sanitize an HTML document" do
_(@s.document('Loremipsumdolor sit amet '))
.must_equal "Lorem ipsum dolor sit amet "
end
# The string passed in must compare equal to its original value afterwards.
it "should not modify the input string" do
input = "foo"
@s.document(input)
_(input).must_equal("foo")
end
it "should not choke on frozen documents" do
_(@s.document("foo")).must_equal "foo"
end
# Input contains \r\n, \n, \r, \r, \r\n; the expectation has five \n.
it "should normalize newlines" do
_(@s.document("a\r\n\n\r\r\r\nz")).must_equal "a\n\n\n\n\nz"
end
# Tab, newline, form feed and space are expected to survive; the sampled
# control characters are expected to be removed.
it "should strip control characters (except ASCII whitespace)" do
sample_control_chars = "\u0001\u0008\u000b\u000e\u001f\u007f\u009f"
whitespace = "\t\n\f\u0020"
_(@s.document("a#{sample_control_chars}#{whitespace}z")).must_equal "a#{whitespace}z"
end
it "should strip non-characters" do
sample_non_chars = "\ufdd0\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}"
_(@s.document("a#{sample_non_chars}z")).must_equal "az"
end
# Builds its input with nest_html_content, using the parser's default
# maximum tree depth as the nesting depth.
describe "when html body exceeds Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH" do
let(:content) do
content = nest_html_content("foo", Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH)
# NOTE(review): this interpolation adds nothing around content as written;
# a wrapping tag was presumably stripped from the literal - confirm.
"#{content}"
end
it "raises an ArgumentError exception" do
assert_raises ArgumentError do
@s.document(content)
end
end
describe "and :max_tree_depth of -1 is supplied in :parser_options" do
before do
@s = Sanitize.new(elements: ["html"], parser_options: {max_tree_depth: -1})
end
it "does not raise an ArgumentError exception" do
_(@s.document(content)).must_equal "foo"
end
end
end
end
# Specs for Sanitize#fragment. No local `before` replaces @s at this level,
# so the examples use the instance assigned by the enclosing group.
#
# NOTE(review): several literals in this group look damaged. For example the
# first input reads 'Loremipsumdolor sit amet ' while the expectation reads
# "Lorem ipsum dolor sit amet ", and four assertions below are textually
# identical. HTML markup appears to have been stripped from the strings in
# this copy; confirm against the upstream file.
describe "#fragment" do
it "should sanitize an HTML fragment" do
_(@s.fragment('Loremipsumdolor sit amet '))
.must_equal "Lorem ipsum dolor sit amet "
end
# The string passed in must compare equal to its original value afterwards.
it "should not modify the input string" do
input = "foo"
@s.fragment(input)
_(input).must_equal "foo"
end
# NOTE(review): the example name is missing the element names it refers to.
it "should not choke on fragments containing or " do
_(@s.fragment("foo")).must_equal "foo"
_(@s.fragment("foo")).must_equal "foo"
_(@s.fragment("foo")).must_equal "foo"
_(@s.fragment("foo")).must_equal "foo"
end
it "should not choke on frozen fragments" do
_(@s.fragment("foo")).must_equal "foo"
end
# Input contains \r\n, \n, \r, \r, \r\n; the expectation has five \n.
it "should normalize newlines" do
_(@s.fragment("a\r\n\n\r\r\r\nz")).must_equal "a\n\n\n\n\nz"
end
# Tab, newline, form feed and space are expected to survive; the sampled
# control characters are expected to be removed.
it "should strip control characters (except ASCII whitespace)" do
sample_control_chars = "\u0001\u0008\u000b\u000e\u001f\u007f\u009f"
whitespace = "\t\n\f\u0020"
_(@s.fragment("a#{sample_control_chars}#{whitespace}z")).must_equal "a#{whitespace}z"
end
it "should strip non-characters" do
sample_non_chars = "\ufdd0\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}"
_(@s.fragment("a#{sample_non_chars}z")).must_equal "az"
end
# Builds its input with nest_html_content, using the parser's default
# maximum tree depth as the nesting depth.
describe "when html body exceeds Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH" do
let(:content) do
content = nest_html_content("foo", Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH)
# NOTE(review): this interpolation adds nothing around content as written;
# a wrapping tag was presumably stripped from the literal - confirm.
"#{content}"
end
it "raises an ArgumentError exception" do
assert_raises ArgumentError do
@s.fragment(content)
end
end
describe "and :max_tree_depth of -1 is supplied in :parser_options" do
before do
@s = Sanitize.new(parser_options: {max_tree_depth: -1})
end
it "does not raise an ArgumentError exception" do
_(@s.fragment(content)).must_equal "foo"
end
end
end
end
# Specs for Sanitize#node!, which is called here on Nokogiri nodes rather
# than on strings.
#
# NOTE(review): the parsed literal reads 'Loremipsumdolor sit amet ' while
# the expectation reads "Lorem ipsum dolor sit amet "; HTML markup appears to
# have been stripped from the strings in this copy - confirm against upstream.
describe "#node!" do
it "should sanitize a Nokogiri::XML::Node" do
doc = Nokogiri::HTML5.parse('Loremipsumdolor sit amet ')
# Move every child of <body> into a fresh fragment and sanitize that.
frag = doc.fragment
doc.xpath("/html/body/node()").each { |node| frag << node }
@s.node!(frag)
_(frag.to_html).must_equal "Lorem ipsum dolor sit amet "
end
describe "when the given node is a document and isn't allowlisted" do
it "should raise a Sanitize::Error" do
doc = Nokogiri::HTML5.parse("foo")
_(proc { @s.node!(doc) }).must_raise Sanitize::Error
end
end
end
end
# Specs for the class-level Sanitize.document, Sanitize.fragment and
# Sanitize.node! methods, each called with an inline config.
#
# NOTE(review): every input literal reads 'Loremipsumdolor sit amet ' while
# the expectations read "Lorem ipsum dolor sit amet "; HTML markup appears to
# have been stripped from the strings in this copy - confirm against upstream.
describe "class methods" do
describe ".document" do
it "should sanitize an HTML document with the given config" do
html = 'Loremipsumdolor sit amet '
_(Sanitize.document(html, elements: ["html"]))
.must_equal "Lorem ipsum dolor sit amet "
end
end
describe ".fragment" do
it "should sanitize an HTML fragment with the given config" do
html = 'Loremipsumdolor sit amet '
_(Sanitize.fragment(html, elements: ["strong"]))
.must_equal "Lorem ipsum dolor sit amet "
end
end
describe ".node!" do
it "should sanitize a Nokogiri::XML::Node with the given config" do
doc = Nokogiri::HTML5.parse('Loremipsumdolor sit amet ')
# Move every child of <body> into a fresh fragment and sanitize that.
frag = doc.fragment
doc.xpath("/html/body/node()").each { |node| frag << node }
Sanitize.node!(frag, elements: ["strong"])
_(frag.to_html).must_equal "Lorem ipsum dolor sit amet "
end
end
end
private
# Wraps +html_content+ in +depth+ levels of nested <span> elements.
#
# The tree-depth specs use this to build markup nested as deep as the
# parser's maximum tree depth. Repeating an empty string would return
# +html_content+ unchanged and never produce any nesting, so a real
# element is used for the opening and closing tags.
#
# html_content - String placed at the innermost level.
# depth        - Integer number of wrapping elements.
#
# Returns the nested markup as a String.
def nest_html_content(html_content, depth)
  "#{"<span>" * depth}#{html_content}#{"</span>" * depth}"
end
end
sanitize-7.0.0/test/common.rb 0000644 0000041 0000041 00000000115 14744072357 016202 0 ustar www-data www-data # frozen_string_literal: true
require "minitest/autorun"
require "sanitize"
sanitize-7.0.0/test/test_malicious_css.rb 0000644 0000041 0000041 00000003352 14744072357 020614 0 ustar www-data www-data # frozen_string_literal: true
require_relative "common"
# Miscellaneous attempts to sneak maliciously crafted CSS past Sanitize. Some of
# these are courtesy of (or inspired by) the OWASP XSS Filter Evasion Cheat
# Sheet.
#
# https://www.owasp.org/index.php/XSS_Filter_Evasion_Cheat_Sheet
describe "Malicious CSS" do
make_my_diffs_pretty!
parallelize_me!
# Each example gets a CSS sanitizer built from the RELAXED config.
before { @s = Sanitize::CSS.new(Sanitize::Config::RELAXED) }
# Both inputs split the word "expression" with CSS comments; each must
# sanitize to an empty string.
it "should not be possible to inject an expression by munging it with a comment" do
  [
    %[width:expr/*XSS*/ession(alert('XSS'))],
    %[width:ex/*XSS*//*/*/pression(alert("XSS"))]
  ].each do |css|
    _(@s.properties(css))
      .must_equal ""
  end
end
# A newline between the property name and expression(...) must not let the
# declaration through.
it "should not be possible to inject an expression by munging it with a newline" do
  sanitized = @s.properties(%[width:\nexpression(alert('XSS'));])

  _(sanitized).must_equal ""
end
# A url() value using the javascript: protocol must sanitize to an empty
# string, both through the CSS sanitizer and through Sanitize.fragment with
# the RELAXED config.
#
# NOTE(review): the literal passed to Sanitize.fragment below contains only a
# line break; the HTML it presumably held appears to have been stripped from
# this copy, and the expected value may be affected too - confirm upstream.
it "should not allow the javascript protocol" do
_(@s.properties(%[background-image:url("javascript:alert('XSS')");]))
.must_equal ""
_(Sanitize.fragment(%[
],
Sanitize::Config::RELAXED)).must_equal ""
end
# The behavior property must be dropped entirely.
it "should not allow behaviors" do
  sanitized = @s.properties(%[behavior: url(xss.htc);])

  _(sanitized).must_equal ""
end
describe "sanitization bypass via CSS at-rule in HTML ],
@s.fragment(%[])
)
end
end
end
sanitize-7.0.0/test/test_malicious_html.rb 0000644 0000041 0000041 00000027413 14744072357 020774 0 ustar www-data www-data # frozen_string_literal: true
require_relative "common"
# Miscellaneous attempts to sneak maliciously crafted HTML past Sanitize. Many
# of these are courtesy of (or inspired by) the OWASP XSS Filter Evasion Cheat
# Sheet.
#
# https://www.owasp.org/index.php/XSS_Filter_Evasion_Cheat_Sheet
describe "Malicious HTML" do
make_my_diffs_pretty!
parallelize_me!
# Each example gets a sanitizer built from the RELAXED config.
before { @s = Sanitize.new(Sanitize::Config::RELAXED) }
# Guards against script injection through conditional comments.
#
# NOTE(review): the input literal below is empty in this copy; the markup it
# presumably contained appears to have been stripped - confirm upstream.
describe "comments" do
it "should not allow script injection via conditional comments" do
_(@s.fragment(%[]))
.must_equal ""
end
end
# Checks how template-style interpolation tags are handled by fragment.
#
# NOTE(review): the literals here look damaged. The ERB example is titled
# "should escape" yet each expected string equals its input, and the PHP
# inputs start mid-tag (" naughtyPHPCode(); ?>"). Entities and markup appear
# to have been decoded or stripped in this copy - confirm upstream.
describe "interpolation (ERB, PHP, etc.)" do
it "should escape ERB-style tags" do
_(@s.fragment("<% naughty_ruby_code %>"))
.must_equal "<% naughty_ruby_code %>"
_(@s.fragment("<%= naughty_ruby_code %>"))
.must_equal "<%= naughty_ruby_code %>"
end
it "should remove PHP-style tags" do
_(@s.fragment(" naughtyPHPCode(); ?>"))
.must_equal ""
_(@s.fragment("= naughtyPHPCode(); ?>"))
.must_equal ""
end
end
# Guards against JavaScript injection through a malformed event attribute.
#
# NOTE(review): the group name and the input literal are both empty in this
# copy; an element name and markup were presumably stripped - confirm
# upstream.
describe "" do
it "should not be possible to inject JS via a malformed event attribute" do
_(@s.document(''))
.must_equal ""
end
end
describe "