sanitize-7.0.0/0000755000004100000410000000000014744072357013411 5ustar www-datawww-datasanitize-7.0.0/lib/0000755000004100000410000000000014744072357014157 5ustar www-datawww-datasanitize-7.0.0/lib/sanitize.rb0000644000004100000410000001763514744072357016346 0ustar www-datawww-data# frozen_string_literal: true require "nokogiri" require "set" require_relative "sanitize/version" require_relative "sanitize/config" require_relative "sanitize/config/default" require_relative "sanitize/config/restricted" require_relative "sanitize/config/basic" require_relative "sanitize/config/relaxed" require_relative "sanitize/css" require_relative "sanitize/transformers/clean_cdata" require_relative "sanitize/transformers/clean_comment" require_relative "sanitize/transformers/clean_css" require_relative "sanitize/transformers/clean_doctype" require_relative "sanitize/transformers/clean_element" class Sanitize attr_reader :config # Matches one or more control characters that should be removed from HTML # before parsing, as defined by the HTML living standard. # # - https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream # - https://infra.spec.whatwg.org/#control REGEX_HTML_CONTROL_CHARACTERS = /[\u0001-\u0008\u000b\u000e-\u001f\u007f-\u009f]+/u # Matches one or more non-characters that should be removed from HTML before # parsing, as defined by the HTML living standard. 
# # - https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream # - https://infra.spec.whatwg.org/#noncharacter REGEX_HTML_NON_CHARACTERS = /[\ufdd0-\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}]+/u # Matches an attribute value that could be treated by a browser as a URL with # a protocol prefix, such as "http:" or "javascript:". Any string of zero or # more characters followed by a colon is considered a match, even if the colon # is encoded as an entity and even if it's an incomplete entity (which IE6 and # Opera will still parse). REGEX_PROTOCOL = /\A\s*([^\/#]*?)(?::|�*58|�*3a)/i # Matches one or more characters that should be stripped from HTML before # parsing. This is a combination of `REGEX_HTML_CONTROL_CHARACTERS` and # `REGEX_HTML_NON_CHARACTERS`. # # https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream REGEX_UNSUITABLE_CHARS = /(?:#{REGEX_HTML_CONTROL_CHARACTERS}|#{REGEX_HTML_NON_CHARACTERS})/u #-- # Class Methods #++ # Returns a sanitized copy of the given full _html_ document, using the # settings in _config_ if specified. # # When sanitizing a document, the `` element must be allowlisted or an # error will be raised. If this is undesirable, you should probably use # {#fragment} instead. def self.document(html, config = {}) Sanitize.new(config).document(html) end # Returns a sanitized copy of the given _html_ fragment, using the settings in # _config_ if specified. def self.fragment(html, config = {}) Sanitize.new(config).fragment(html) end # Sanitizes the given `Nokogiri::XML::Node` instance and all its children. def self.node!(node, config = {}) Sanitize.new(config).node!(node) end # Aliases for pre-3.0.0 backcompat. 
class << Sanitize # @deprecated Use {.document} instead. alias_method :clean_document, :document # @deprecated Use {.fragment} instead. alias_method :clean, :fragment # @deprecated Use {.node!} instead. alias_method :clean_node!, :node! end #-- # Instance Methods #++ # Returns a new Sanitize object initialized with the settings in _config_. def initialize(config = {}) @config = Config.merge(Config::DEFAULT, config) @transformers = Array(@config[:transformers]).dup # Default transformers always run at the end of the chain, after any custom # transformers. @transformers << Transformers::CleanElement.new(@config) @transformers << Transformers::CleanComment unless @config[:allow_comments] if @config[:elements].include?("style") scss = Sanitize::CSS.new(config) @transformers << Transformers::CSS::CleanElement.new(scss) end if @config[:attributes].values.any? { |attr| attr.include?("style") } scss ||= Sanitize::CSS.new(config) @transformers << Transformers::CSS::CleanAttribute.new(scss) end @transformers << Transformers::CleanDoctype @transformers << Transformers::CleanCDATA @transformer_config = {config: @config} end # Returns a sanitized copy of the given _html_ document. # # When sanitizing a document, the `` element must be allowlisted or an # error will be raised. If this is undesirable, you should probably use # {#fragment} instead. def document(html) return "" unless html doc = Nokogiri::HTML5.parse(preprocess(html), **@config[:parser_options]) node!(doc) to_html(doc) end # @deprecated Use {#document} instead. alias_method :clean_document, :document # Returns a sanitized copy of the given _html_ fragment. def fragment(html) return "" unless html frag = Nokogiri::HTML5.fragment(preprocess(html), **@config[:parser_options]) node!(frag) to_html(frag) end # @deprecated Use {#fragment} instead. alias_method :clean, :fragment # Sanitizes the given `Nokogiri::XML::Node` and all its children, modifying it # in place. 
# # If _node_ is a `Nokogiri::XML::Document`, the `` element must be # allowlisted or an error will be raised. def node!(node) raise ArgumentError unless node.is_a?(Nokogiri::XML::Node) if node.is_a?(Nokogiri::XML::Document) unless @config[:elements].include?("html") raise Error, 'When sanitizing a document, "" must be allowlisted.' end end node_allowlist = Set.new traverse(node) do |n| transform_node!(n, node_allowlist) end node end # @deprecated Use {#node!} instead. alias_method :clean_node!, :node! private # Preprocesses HTML before parsing to remove undesirable Unicode chars. def preprocess(html) html = html.to_s.dup unless html.encoding.name == "UTF-8" html.encode!("UTF-8", invalid: :replace, undef: :replace) end html.gsub!(REGEX_UNSUITABLE_CHARS, "") html end def to_html(node) node.to_html(preserve_newline: true) end def transform_node!(node, node_allowlist) @transformers.each do |transformer| # Since transform_node! may be called in a tight loop to process thousands # of items, we can optimize both memory and CPU performance by: # # 1. Reusing the same config hash for each transformer # 2. Directly assigning values to hash instead of using merge!. Not only # does merge! create a new hash, it is also 2.6x slower: # https://github.com/JuanitoFatas/fast-ruby#hashmerge-vs-hashmerge-code config = @transformer_config config[:is_allowlisted] = config[:is_whitelisted] = node_allowlist.include?(node) config[:node] = node config[:node_name] = node.name.downcase config[:node_allowlist] = config[:node_whitelist] = node_allowlist result = transformer.call(**config) if result.is_a?(Hash) result_allowlist = result[:node_allowlist] || result[:node_whitelist] if result_allowlist.respond_to?(:each) node_allowlist.merge(result_allowlist) end end end node end # Performs top-down traversal of the given node, operating first on the node # itself, then traversing each child (if any) in order. 
def traverse(node, &block) yield node child = node.child while child prev = child.previous_sibling traverse(child, &block) child = if child.parent == node child.next_sibling else # The child was unlinked or reparented, so traverse the previous node's # next sibling, or the parent's first child if there is no previous # node. prev ? prev.next_sibling : node.child end end end class Error < StandardError; end end sanitize-7.0.0/lib/sanitize/0000755000004100000410000000000014744072357016005 5ustar www-datawww-datasanitize-7.0.0/lib/sanitize/config.rb0000644000004100000410000000333414744072357017602 0ustar www-datawww-data# frozen_string_literal: true require "set" class Sanitize module Config # Deeply freezes and returns the given configuration Hash. def self.freeze_config(config) if Hash === config config.each_value { |c| freeze_config(c) } elsif Array === config || Set === config config.each { |c| freeze_config(c) } end config.freeze end # Returns a new Hash containing the result of deeply merging *other_config* # into *config*. Does not modify *config* or *other_config*. # # This is the safest way to use a built-in Sanitize config as the basis for # your own custom config. def self.merge(config, other_config = {}) raise ArgumentError, "config must be a Hash" unless Hash === config raise ArgumentError, "other_config must be a Hash" unless Hash === other_config merged = {} keys = Set.new(config.keys + other_config.keys) keys.each do |key| oldval = config[key] if other_config.has_key?(key) newval = other_config[key] merged[key] = if Hash === oldval && Hash === newval oldval.empty? ? newval.dup : merge(oldval, newval) elsif Array === newval && key != :transformers Set.new(newval) else can_dupe?(newval) ? newval.dup : newval end else merged[key] = can_dupe?(oldval) ? oldval.dup : oldval end end merged end # Returns `true` if `dup` may be safely called on _value_, `false` # otherwise. def self.can_dupe?(value) !(value == true || value == false || value.nil? 
|| Method === value || Numeric === value || Symbol === value) end private_class_method :can_dupe? end end sanitize-7.0.0/lib/sanitize/version.rb0000644000004100000410000000010614744072357020014 0ustar www-datawww-data# frozen_string_literal: true class Sanitize VERSION = "7.0.0" end sanitize-7.0.0/lib/sanitize/css.rb0000644000004100000410000002474114744072357017132 0ustar www-datawww-data# frozen_string_literal: true require "crass" require "set" class Sanitize class CSS attr_reader :config # -- Class Methods --------------------------------------------------------- # Sanitizes inline CSS style properties. # # This is most useful for sanitizing non-stylesheet fragments of CSS like # you would find in the `style` attribute of an HTML element. To sanitize a # full CSS stylesheet, use {.stylesheet}. # # @example # Sanitize::CSS.properties("background: url(foo.png); color: #fff;") # # @return [String] Sanitized CSS properties. def self.properties(css, config = {}) new(config).properties(css) end # Sanitizes a full CSS stylesheet. # # A stylesheet may include selectors, at-rules, and comments. To sanitize # only inline style properties such as the contents of an HTML `style` # attribute, use {.properties}. # # @example # css = %[ # .foo { # background: url(foo.png); # color: #fff; # } # # #bar { # font: 42pt 'Comic Sans MS'; # } # ] # # Sanitize::CSS.stylesheet(css, Sanitize::Config::RELAXED) # # @return [String] Sanitized CSS stylesheet. def self.stylesheet(css, config = {}) new(config).stylesheet(css) end # Sanitizes the given Crass CSS parse tree and all its children, modifying # it in place. # # @example # css = %[ # .foo { # background: url(foo.png); # color: #fff; # } # # #bar { # font: 42pt 'Comic Sans MS'; # } # ] # # tree = Crass.parse(css) # Sanitize::CSS.tree!(tree, Sanitize::Config::RELAXED) # # @return [Array] Sanitized Crass CSS parse tree. 
def self.tree!(tree, config = {}) new(config).tree!(tree) end # -- Instance Methods ------------------------------------------------------ # Returns a new Sanitize::CSS object initialized with the settings in # _config_. def initialize(config = {}) @config = Config.merge(Config::DEFAULT[:css], config[:css] || config) @at_rules = Set.new(@config[:at_rules]) @at_rules_with_properties = Set.new(@config[:at_rules_with_properties]) @at_rules_with_styles = Set.new(@config[:at_rules_with_styles]) @import_url_validator = @config[:import_url_validator] end # Sanitizes inline CSS style properties. # # This is most useful for sanitizing non-stylesheet fragments of CSS like # you would find in the `style` attribute of an HTML element. To sanitize a # full CSS stylesheet, use {#stylesheet}. # # @example # scss = Sanitize::CSS.new(Sanitize::Config::RELAXED) # scss.properties("background: url(foo.png); color: #fff;") # # @return [String] Sanitized CSS properties. def properties(css) tree = Crass.parse_properties(css, preserve_comments: @config[:allow_comments], preserve_hacks: @config[:allow_hacks]) tree!(tree) Crass::Parser.stringify(tree) end # Sanitizes a full CSS stylesheet. # # A stylesheet may include selectors, at-rules, and comments. To sanitize # only inline style properties such as the contents of an HTML `style` # attribute, use {#properties}. # # @example # css = %[ # .foo { # background: url(foo.png); # color: #fff; # } # # #bar { # font: 42pt 'Comic Sans MS'; # } # ] # # scss = Sanitize::CSS.new(Sanitize::Config::RELAXED) # scss.stylesheet(css) # # @return [String] Sanitized CSS stylesheet. def stylesheet(css) tree = Crass.parse(css, preserve_comments: @config[:allow_comments], preserve_hacks: @config[:allow_hacks]) tree!(tree) Crass::Parser.stringify(tree) end # Sanitizes the given Crass CSS parse tree and all its children, modifying # it in place. 
# # @example # css = %[ # .foo { # background: url(foo.png); # color: #fff; # } # # #bar { # font: 42pt 'Comic Sans MS'; # } # ] # # scss = Sanitize::CSS.new(Sanitize::Config::RELAXED) # tree = Crass.parse(css) # # scss.tree!(tree) # # @return [Array] Sanitized Crass CSS parse tree. def tree!(tree) preceded_by_property = false tree.map! do |node| next nil if node.nil? case node[:node] when :at_rule preceded_by_property = false next at_rule!(node) when :comment next node if @config[:allow_comments] when :property prop = property!(node) preceded_by_property = !prop.nil? next prop when :semicolon # Only preserve the semicolon if it was preceded by an allowlisted # property. Otherwise, omit it in order to prevent redundant # semicolons. if preceded_by_property preceded_by_property = false next node end when :style_rule preceded_by_property = false tree!(node[:children]) next node when :whitespace next node end nil end tree end # -- Protected Instance Methods -------------------------------------------- protected # Sanitizes a CSS at-rule node. Returns the sanitized node, or `nil` if the # current config doesn't allow this at-rule. def at_rule!(rule) name = rule[:name].downcase if @at_rules_with_styles.include?(name) styles = Crass::Parser.parse_rules(rule[:block], preserve_comments: @config[:allow_comments], preserve_hacks: @config[:allow_hacks]) rule[:block] = tree!(styles) elsif @at_rules_with_properties.include?(name) props = Crass::Parser.parse_properties(rule[:block], preserve_comments: @config[:allow_comments], preserve_hacks: @config[:allow_hacks]) rule[:block] = tree!(props) elsif @at_rules.include?(name) return nil if name == "import" && !import_url_allowed?(rule) return nil if rule.has_key?(:block) else return nil end rule end # Returns `true` if the given CSS function name is an image-related function # that may contain image URLs that need to be validated. 
def image_function?(name) ["image", "image-set", "-webkit-image-set"].include?(name) end # Passes the URL value of an @import rule to a block to ensure # it's an allowed URL def import_url_allowed?(rule) return true unless @import_url_validator url_token = rule[:tokens].detect { |t| t[:node] == :url || t[:node] == :string } # don't allow @imports with no URL value return false unless url_token && (import_url = url_token[:value]) @import_url_validator.call(import_url) end # Sanitizes a CSS property node. Returns the sanitized node, or `nil` if the # current config doesn't allow this property. def property!(prop) name = prop[:name].downcase # Preserve IE * and _ hacks if desired. if @config[:allow_hacks] name.slice!(0) if /\A[*_]/.match?(name) end return nil unless @config[:properties].include?(name) nodes = prop[:children].dup combined_value = +"" nodes.each do |child| value = child[:value] case child[:node] when :ident combined_value << value.downcase if String === value when :function if child.key?(:name) name = child[:name].downcase if name == "url" return nil unless valid_url?(child) end if image_function?(name) return nil unless valid_image?(child) end combined_value << name return nil if name == "expression" || combined_value == "expression" end if Array === value nodes.concat(value) elsif String === value lowercase_value = value.downcase combined_value << lowercase_value return nil if lowercase_value == "expression" || combined_value == "expression" end when :url return nil unless valid_url?(child) when :bad_url return nil end end prop end # Returns `true` if the given node (which may be of type `:url` or # `:function`, since the CSS syntax can produce both) uses an allowlisted # protocol. 
def valid_url?(node) type = node[:node] if type == :function return false unless node.key?(:name) && node[:name].downcase == "url" return false unless Array === node[:value] # A URL function's `:value` should be an array containing no more than # one `:string` node and any number of `:whitespace` nodes. # # If it contains more than one `:string` node, or if it contains any # other nodes except `:whitespace` nodes, it's not valid. url_string_node = nil node[:value].each do |token| return false unless Hash === token case token[:node] when :string return false unless url_string_node.nil? url_string_node = token when :whitespace next else return false end end return false if url_string_node.nil? url = url_string_node[:value] elsif type == :url url = node[:value] else return false end if url =~ Sanitize::REGEX_PROTOCOL @config[:protocols].include?($1.downcase) else @config[:protocols].include?(:relative) end end # Returns `true` if the given node is an image-related function and contains # only strings that use an allowlisted protocol. def valid_image?(node) return false unless node[:node] == :function return false unless node.key?(:name) && image_function?(node[:name].downcase) return false unless Array === node[:value] node[:value].each do |token| return false unless Hash === token case token[:node] when :string if token[:value] =~ Sanitize::REGEX_PROTOCOL return false unless @config[:protocols].include?($1.downcase) else return false unless @config[:protocols].include?(:relative) end else next end end end end end sanitize-7.0.0/lib/sanitize/config/0000755000004100000410000000000014744072357017252 5ustar www-datawww-datasanitize-7.0.0/lib/sanitize/config/default.rb0000644000004100000410000001126514744072357021230 0ustar www-datawww-data# frozen_string_literal: true class Sanitize module Config DEFAULT = freeze_config( # HTML attributes to add to specific elements. By default, no attributes # are added. add_attributes: {}, # Whether or not to allow HTML comments. 
Allowing comments is strongly # discouraged, since IE allows script execution within conditional # comments. allow_comments: false, # Whether or not to allow well-formed HTML doctype declarations such as # "" when sanitizing a document. This setting is ignored # when sanitizing fragments. allow_doctype: false, # HTML attributes to allow in specific elements. By default, no attributes # are allowed. Use the symbol :data to indicate that arbitrary HTML5 # data-* attributes should be allowed. attributes: {}, # CSS sanitization settings. css: { # Whether or not to allow CSS comments. allow_comments: false, # Whether or not to allow browser compatibility hacks such as the IE * # and _ hacks. These are generally harmless, but technically result in # invalid CSS. allow_hacks: false, # CSS at-rules to allow that may not have associated blocks (e.g. # "import"). # # https://developer.mozilla.org/en-US/docs/Web/CSS/At-rule at_rules: [], # CSS at-rules to allow whose blocks may contain properties (e.g. # "font-face"). at_rules_with_properties: [], # CSS at-rules to allow whose blocks may contain styles (e.g. "media"). at_rules_with_styles: [], # CSS properties to allow. properties: [], # URL protocols to allow in CSS URLs. protocols: [] }, # HTML elements to allow. By default, no elements are allowed (which means # that all HTML will be stripped). # # Warning: Sanitize cannot safely sanitize the contents of foreign # elements (elements in the MathML or SVG namespaces). Do not add `math` # or `svg` to this list! If you do, you may create a security # vulnerability in your application. elements: [], # HTML parsing options to pass to Nokogumbo. # https://github.com/rubys/nokogumbo/tree/v2.0.1#parsing-options parser_options: {}, # URL handling protocols to allow in specific attributes. By default, no # protocols are allowed. Use :relative in place of a protocol if you want # to allow relative URLs sans protocol. 
protocols: {}, # If this is true, Sanitize will remove the contents of any filtered # elements in addition to the elements themselves. By default, Sanitize # leaves the safe parts of an element's contents behind when the element # is removed. # # If this is an Array or Set of element names, then only the contents of # the specified elements (when filtered) will be removed, and the contents # of all other filtered elements will be left behind. remove_contents: %w[ iframe math noembed noframes noscript plaintext script style svg xmp ], # Transformers allow you to filter or alter nodes using custom logic. See # README.md for details and examples. transformers: [], # Elements which, when removed, should have their contents surrounded by # values specified with `before` and `after` keys to preserve readability. # For example, `foo
bar
baz` will become 'foo bar baz' when the #
is removed. whitespace_elements: { "address" => {before: " ", after: " "}, "article" => {before: " ", after: " "}, "aside" => {before: " ", after: " "}, "blockquote" => {before: " ", after: " "}, "br" => {before: " ", after: " "}, "dd" => {before: " ", after: " "}, "div" => {before: " ", after: " "}, "dl" => {before: " ", after: " "}, "dt" => {before: " ", after: " "}, "footer" => {before: " ", after: " "}, "h1" => {before: " ", after: " "}, "h2" => {before: " ", after: " "}, "h3" => {before: " ", after: " "}, "h4" => {before: " ", after: " "}, "h5" => {before: " ", after: " "}, "h6" => {before: " ", after: " "}, "header" => {before: " ", after: " "}, "hgroup" => {before: " ", after: " "}, "hr" => {before: " ", after: " "}, "li" => {before: " ", after: " "}, "nav" => {before: " ", after: " "}, "ol" => {before: " ", after: " "}, "p" => {before: " ", after: " "}, "pre" => {before: " ", after: " "}, "section" => {before: " ", after: " "}, "ul" => {before: " ", after: " "} } ) end end sanitize-7.0.0/lib/sanitize/config/relaxed.rb0000644000004100000410000005475414744072357021242 0ustar www-datawww-data# frozen_string_literal: true class Sanitize module Config RELAXED = freeze_config( elements: BASIC[:elements] + %w[ address article aside bdi bdo body caption col colgroup data del div figcaption figure footer h1 h2 h3 h4 h5 h6 head header hgroup hr html img ins main nav rp rt ruby section span style summary table tbody td tfoot th thead title tr wbr ], allow_doctype: true, attributes: merge(BASIC[:attributes], :all => %w[class dir hidden id lang style tabindex title translate], "a" => %w[href hreflang name rel], "col" => %w[span width], "colgroup" => %w[span width], "data" => %w[value], "del" => %w[cite datetime], "img" => %w[align alt border height src srcset width], "ins" => %w[cite datetime], "li" => %w[value], "ol" => %w[reversed start type], "style" => %w[media scoped type], "table" => %w[align bgcolor border cellpadding cellspacing frame rules sortable summary 
width], "td" => %w[abbr align axis colspan headers rowspan valign width], "th" => %w[abbr align axis colspan headers rowspan scope sorted valign width], "ul" => %w[type]), protocols: merge(BASIC[:protocols], "del" => {"cite" => ["http", "https", :relative]}, "img" => {"src" => ["http", "https", :relative]}, "ins" => {"cite" => ["http", "https", :relative]}), css: { allow_comments: true, allow_hacks: true, at_rules_with_properties: %w[ bottom-center bottom-left bottom-left-corner bottom-right bottom-right-corner font-face left-bottom left-middle left-top page right-bottom right-middle right-top top-center top-left top-left-corner top-right top-right-corner ], at_rules_with_styles: %w[ -moz-keyframes -o-keyframes -webkit-keyframes container document keyframes media supports ], protocols: ["http", "https", :relative], properties: %w[ -moz-appearance -moz-background-inline-policy -moz-box-sizing -moz-column-count -moz-column-fill -moz-column-gap -moz-column-rule -moz-column-rule-color -moz-column-rule-style -moz-column-rule-width -moz-column-width -moz-font-feature-settings -moz-font-language-override -moz-hyphens -moz-text-align-last -moz-text-decoration-color -moz-text-decoration-line -moz-text-decoration-style -moz-text-size-adjust -ms-background-position-x -ms-background-position-y -ms-block-progression -ms-content-zoom-chaining -ms-content-zoom-limit -ms-content-zoom-limit-max -ms-content-zoom-limit-min -ms-content-zoom-snap -ms-content-zoom-snap-points -ms-content-zoom-snap-type -ms-content-zooming -ms-filter -ms-flex -ms-flex-align -ms-flex-direction -ms-flex-order -ms-flex-pack -ms-flex-wrap -ms-flow-from -ms-flow-into -ms-grid-column -ms-grid-column-align -ms-grid-column-span -ms-grid-columns -ms-grid-row -ms-grid-row-align -ms-grid-row-span -ms-grid-rows -ms-high-contrast-adjust -ms-hyphenate-limit-chars -ms-hyphenate-limit-lines -ms-hyphenate-limit-zone -ms-hyphens -ms-ime-mode -ms-interpolation-mode -ms-layout-flow -ms-layout-grid -ms-layout-grid-char 
-ms-layout-grid-line -ms-layout-grid-mode -ms-layout-grid-type -ms-overflow-style -ms-overflow-x -ms-overflow-y -ms-progress-appearance -ms-scroll-chaining -ms-scroll-limit -ms-scroll-limit-x-max -ms-scroll-limit-x-min -ms-scroll-limit-y-max -ms-scroll-limit-y-min -ms-scroll-rails -ms-scroll-snap-points-x -ms-scroll-snap-points-y -ms-scroll-snap-type -ms-scroll-snap-x -ms-scroll-snap-y -ms-scroll-translation -ms-scrollbar-arrow-color -ms-scrollbar-base-color -ms-scrollbar-darkshadow-color -ms-scrollbar-face-color -ms-scrollbar-highlight-color -ms-scrollbar-shadow-color -ms-scrollbar-track-color -ms-text-align-last -ms-text-autospace -ms-text-justify -ms-text-kashida-space -ms-text-overflow -ms-text-size-adjust -ms-text-underline-position -ms-touch-action -ms-user-select -ms-word-break -ms-word-wrap -ms-wrap-flow -ms-wrap-margin -ms-wrap-through -ms-writing-mode -ms-zoom -webkit-align-content -webkit-align-items -webkit-align-self -webkit-animation -webkit-animation-delay -webkit-animation-direction -webkit-animation-duration -webkit-animation-fill-mode -webkit-animation-iteration-count -webkit-animation-name -webkit-animation-play-state -webkit-animation-timing-function -webkit-appearance -webkit-backface-visibility -webkit-background-blend-mode -webkit-background-clip -webkit-background-composite -webkit-background-origin -webkit-background-size -webkit-blend-mode -webkit-border-after -webkit-border-after-color -webkit-border-after-style -webkit-border-after-width -webkit-border-before -webkit-border-before-color -webkit-border-before-style -webkit-border-before-width -webkit-border-bottom-left-radius -webkit-border-bottom-right-radius -webkit-border-end -webkit-border-end-color -webkit-border-end-style -webkit-border-end-width -webkit-border-fit -webkit-border-image -webkit-border-radius -webkit-border-start -webkit-border-start-color -webkit-border-start-style -webkit-border-start-width -webkit-border-top-left-radius -webkit-border-top-right-radius 
-webkit-box-align -webkit-box-decoration-break -webkit-box-flex -webkit-box-flex-group -webkit-box-lines -webkit-box-ordinal-group -webkit-box-orient -webkit-box-pack -webkit-box-reflect -webkit-box-shadow -webkit-box-sizing -webkit-clip-path -webkit-column-axis -webkit-column-break-after -webkit-column-break-before -webkit-column-break-inside -webkit-column-count -webkit-column-gap -webkit-column-progression -webkit-column-rule -webkit-column-rule-color -webkit-column-rule-style -webkit-column-rule-width -webkit-column-span -webkit-column-width -webkit-columns -webkit-filter -webkit-flex -webkit-flex-basis -webkit-flex-direction -webkit-flex-flow -webkit-flex-grow -webkit-flex-shrink -webkit-flex-wrap -webkit-flow-from -webkit-flow-into -webkit-font-size-delta -webkit-font-smoothing -webkit-grid-area -webkit-grid-auto-columns -webkit-grid-auto-flow -webkit-grid-auto-rows -webkit-grid-column -webkit-grid-column-end -webkit-grid-column-start -webkit-grid-definition-columns -webkit-grid-definition-rows -webkit-grid-row -webkit-grid-row-end -webkit-grid-row-start -webkit-justify-content -webkit-line-clamp -webkit-logical-height -webkit-logical-width -webkit-margin-after -webkit-margin-after-collapse -webkit-margin-before -webkit-margin-before-collapse -webkit-margin-bottom-collapse -webkit-margin-collapse -webkit-margin-end -webkit-margin-start -webkit-margin-top-collapse -webkit-marquee -webkit-marquee-direction -webkit-marquee-increment -webkit-marquee-repetition -webkit-marquee-speed -webkit-marquee-style -webkit-mask -webkit-mask-box-image -webkit-mask-box-image-outset -webkit-mask-box-image-repeat -webkit-mask-box-image-slice -webkit-mask-box-image-source -webkit-mask-box-image-width -webkit-mask-clip -webkit-mask-composite -webkit-mask-image -webkit-mask-origin -webkit-mask-position -webkit-mask-position-x -webkit-mask-position-y -webkit-mask-repeat -webkit-mask-repeat-x -webkit-mask-repeat-y -webkit-mask-size -webkit-mask-source-type -webkit-max-logical-height 
-webkit-max-logical-width -webkit-min-logical-height -webkit-min-logical-width -webkit-opacity -webkit-order -webkit-padding-after -webkit-padding-before -webkit-padding-end -webkit-padding-start -webkit-perspective -webkit-perspective-origin -webkit-perspective-origin-x -webkit-perspective-origin-y -webkit-region-break-after -webkit-region-break-before -webkit-region-break-inside -webkit-region-fragment -webkit-shape-inside -webkit-shape-margin -webkit-shape-outside -webkit-shape-padding -webkit-svg-shadow -webkit-tap-highlight-color -webkit-text-decoration -webkit-text-decoration-color -webkit-text-decoration-line -webkit-text-decoration-style -webkit-text-fill-color -webkit-text-size-adjust -webkit-touch-callout -webkit-transform -webkit-transform-origin -webkit-transform-origin-x -webkit-transform-origin-y -webkit-transform-origin-z -webkit-transform-style -webkit-transition -webkit-transition-delay -webkit-transition-duration -webkit-transition-property -webkit-transition-timing-function -webkit-user-drag -webkit-wrap-flow -webkit-wrap-through accent-color align-content align-items align-self alignment-adjust alignment-baseline all anchor-point anchor-name anchor-scope animation animation-composition animation-delay animation-direction animation-duration animation-fill-mode animation-iteration-count animation-name animation-play-state animation-range animation-range-end animation-range-start animation-timeline animation-timing-function appearance aspect-ratio azimuth backface-visibility background background-attachment background-blend-mode background-clip background-color background-image background-origin background-position background-repeat background-size baseline-shift binding bleed baseline-source block-ellipsis block-size bookmark-label bookmark-level bookmark-state border border-block border-block-color border-block-end border-block-end-color border-block-end-style border-block-end-width border-block-start border-block-start-color 
border-block-start-style border-block-start-width border-block-style border-block-width border-bottom border-bottom-color border-bottom-left-radius border-bottom-right-radius border-bottom-style border-bottom-width border-boundary border-collapse border-color border-end-end-radius border-end-start-radius border-image border-image-outset border-image-repeat border-image-slice border-image-source border-image-width border-inline border-inline-color border-inline-end border-inline-end-color border-inline-end-style border-inline-end-width border-inline-start border-inline-start-color border-inline-start-style border-inline-start-width border-inline-style border-inline-width border-left border-left-color border-left-style border-left-width border-radius border-right border-right-color border-right-style border-right-width border-spacing border-start-end-radius border-start-start-radius border-style border-top border-top-color border-top-left-radius border-top-right-radius border-top-style border-top-width border-width bottom box-decoration-break box-shadow box-sizing box-snap box-suppress break-after break-before break-inside caption-side chains caret caret-color caret-shape clear clip clip-path clip-rule color color-interpolation color-adjust color-interpolation-filters color-profile color-rendering color-scheme column-count column-fill column-gap column-rule column-rule-color column-rule-style column-rule-width column-span column-width columns contain contain-intrinsic-block-size contain-intrinsic-height contain-intrinsic-inline-size contain-intrinsic-size contain-intrinsic-width container container-name container-type content content-visibility continue counter-increment counter-reset counter-set crop cue cue-after cue-before cursor direction display display-inside display-list display-outside dominant-baseline elevation empty-cells enable-background fill fill-opacity fill-rule filter flex flex-basis flex-direction flex-flow flex-grow flex-shrink flex-wrap float 
float-offset flood-color flood-opacity flow-from flow-into font font-family font-feature-settings font-kerning font-language-override font-optical-sizing font-palette font-size font-size-adjust font-stretch font-style font-synthesis font-synthesis-position font-synthesis-small-caps font-synthesis-style font-synthesis-weight font-variant font-variant-alternates font-variant-caps font-variant-east-asian font-variant-emoji font-variant-ligatures font-variant-numeric font-variant-position font-variation-settings font-weight glyph-orientation-horizontal font-width footnote-display footnote-policy forced-color-adjust gap glyph-orientation-vertical grid grid-area grid-auto-columns grid-auto-flow grid-auto-rows grid-column grid-column-end grid-column-start grid-row grid-row-end grid-row-start grid-template grid-template-areas grid-template-columns grid-template-rows hanging-punctuation height hyphens icon image-orientation image-rendering image-resolution ime-mode initial-letters inline-box-align justify-content justify-items justify-self kerning left letter-spacing lighting-color line-box-contain line-break line-grid line-height line-snap line-stacking line-stacking-ruby line-stacking-shift line-stacking-strategy list-style list-style-image list-style-position list-style-type margin margin-bottom margin-left margin-right margin-top marker marker-end marker-mid marker-offset marker-side marker-start marks mask mask-box mask-box-outset mask-box-repeat mask-box-slice mask-box-source mask-box-width mask-clip mask-image mask-origin mask-position mask-repeat mask-size mask-source-type mask-type max-height max-lines max-width min-height min-width move-to nav-down nav-index nav-left nav-right nav-up object-fit object-position opacity order orphans outline outline-color outline-offset outline-style outline-width overflow overflow-wrap overflow-x overflow-y padding padding-bottom padding-left padding-right padding-top page page-break-after page-break-before page-break-inside 
page-policy pause pause-after pause-before perspective perspective-origin pitch pitch-range play-during pointer-events position presentation-level quotes region-fragment resize rest rest-after rest-before richness right rotation rotation-point ruby-align ruby-merge ruby-position shape-image-threshold shape-margin shape-outside shape-rendering size speak speak-as speak-header speak-numeral speak-punctuation speech-rate stop-color stop-opacity stress string-set stroke stroke-dasharray stroke-dashoffset stroke-linecap stroke-linejoin stroke-miterlimit stroke-opacity stroke-width tab-size table-layout text-align text-align-all text-align-last text-anchor text-combine-horizontal text-autospace text-box text-box-edge text-box-trim text-combine-upright text-decoration text-decoration-color text-decoration-line text-decoration-skip text-decoration-skip-box text-decoration-skip-ink text-decoration-skip-inset text-decoration-skip-self text-decoration-skip-spaces text-decoration-style text-decoration-thickness text-emphasis text-emphasis-color text-emphasis-position text-emphasis-skip text-emphasis-style text-height text-group-align text-indent text-justify text-orientation text-overflow text-rendering text-shadow text-size-adjust text-space-collapse text-spacing text-spacing-trim text-transform text-underline-offset text-underline-position text-wrap text-wrap-mode text-wrap-style timeline-scope top touch-action transform transform-box transform-origin transform-style transition transition-delay transition-duration transition-property transition-timing-function translate unicode-bidi unicode-range user-select vertical-align view-timeline view-timeline-axis view-timeline-inset view-timeline-name view-transition-class view-transition-group view-transition-name visibility voice-balance voice-duration voice-family voice-pitch voice-range voice-rate voice-stress voice-volume volume white-space white-space-collapse white-space-trim widows width will-change word-break 
word-space-transform word-spacing word-wrap wrap-after wrap-before wrap-flow wrap-inside wrap-through writing-mode z-index ] } ) end end sanitize-7.0.0/lib/sanitize/config/basic.rb0000644000004100000410000000142214744072357020657 0ustar www-datawww-data# frozen_string_literal: true class Sanitize module Config BASIC = freeze_config( elements: RESTRICTED[:elements] + %w[ a abbr blockquote br cite code dd dfn dl dt kbd li mark ol p pre q s samp small strike sub sup time ul var ], attributes: { "a" => %w[href], "abbr" => %w[title], "blockquote" => %w[cite], "dfn" => %w[title], "q" => %w[cite], "time" => %w[datetime pubdate] }, add_attributes: { "a" => {"rel" => "nofollow"} }, protocols: { "a" => {"href" => ["ftp", "http", "https", "mailto", :relative]}, "blockquote" => {"cite" => ["http", "https", :relative]}, "q" => {"cite" => ["http", "https", :relative]} } ) end end sanitize-7.0.0/lib/sanitize/config/restricted.rb0000644000004100000410000000022214744072357021743 0ustar www-datawww-data# frozen_string_literal: true class Sanitize module Config RESTRICTED = freeze_config( elements: %w[b em i strong u] ) end end sanitize-7.0.0/lib/sanitize/transformers/0000755000004100000410000000000014744072357020532 5ustar www-datawww-datasanitize-7.0.0/lib/sanitize/transformers/clean_comment.rb0000644000004100000410000000040414744072357023661 0ustar www-datawww-data# frozen_string_literal: true class Sanitize module Transformers CleanComment = lambda do |env| node = env[:node] if node.type == Nokogiri::XML::Node::COMMENT_NODE node.unlink unless env[:is_allowlisted] end end end end sanitize-7.0.0/lib/sanitize/transformers/clean_element.rb0000644000004100000410000002311614744072357023655 0ustar www-datawww-data# frozen_string_literal: true require "cgi" require "set" class Sanitize module Transformers class CleanElement # Matches a valid HTML5 data attribute name. 
The unicode ranges included # here are a conservative subset of the full range of characters that are # technically allowed, with the intent of matching the most common # characters used in data attribute names while excluding uncommon or # potentially misleading characters, or characters with the potential to # be normalized into unsafe or confusing forms. # # If you need data attr names with characters that aren't included here # (such as combining marks, full-width characters, or CJK), please # consider creating a custom transformer to validate attributes according # to your needs. # # https://html.spec.whatwg.org/multipage/dom.html#embedding-custom-non-visible-data-with-the-data-*-attributes REGEX_DATA_ATTR = /\Adata-(?!xml)[a-z_][\w.\u00E0-\u00F6\u00F8-\u017F\u01DD-\u02AF-]*\z/u # Elements whose content is treated as unescaped text by HTML parsers. UNESCAPED_TEXT_ELEMENTS = Set.new(%w[ iframe noembed noframes noscript plaintext script style xmp ]) # Attributes that need additional escaping on `` elements due to unsafe # libxml2 behavior. UNSAFE_LIBXML_ATTRS_A = Set.new(%w[ name ]) # Attributes that need additional escaping on all elements due to unsafe # libxml2 behavior. UNSAFE_LIBXML_ATTRS_GLOBAL = Set.new(%w[ action href src ]) # Mapping of original characters to escape sequences for characters that # should be escaped in attributes affected by unsafe libxml2 behavior. UNSAFE_LIBXML_ESCAPE_CHARS = { " " => "%20", '"' => "%22" } # Regex that matches any single character that needs to be escaped in # attributes affected by unsafe libxml2 behavior. 
UNSAFE_LIBXML_ESCAPE_REGEX = /[ "]/ def initialize(config) @add_attributes = config[:add_attributes] @attributes = config[:attributes].dup @elements = config[:elements] @protocols = config[:protocols] @remove_all_contents = false @remove_element_contents = Set.new @whitespace_elements = {} @attributes.each do |element_name, attrs| unless element_name == :all @attributes[element_name] = Set.new(attrs).merge(@attributes[:all] || []) end end # Backcompat: if :whitespace_elements is a Set, convert it to a hash. if config[:whitespace_elements].is_a?(Set) config[:whitespace_elements].each do |element| @whitespace_elements[element] = {before: " ", after: " "} end else @whitespace_elements = config[:whitespace_elements] end if config[:remove_contents].is_a?(Enumerable) @remove_element_contents.merge(config[:remove_contents].map(&:to_s)) else @remove_all_contents = !!config[:remove_contents] end end def call(env) node = env[:node] return if node.type != Nokogiri::XML::Node::ELEMENT_NODE || env[:is_allowlisted] name = env[:node_name] # Delete any element that isn't in the config allowlist, unless the node # has already been deleted from the document. # # It's important that we not try to reparent the children of a node that # has already been deleted, since that seems to trigger a memory leak in # Nokogiri. unless @elements.include?(name) || node.parent.nil? # Elements like br, div, p, etc. need to be replaced with whitespace # in order to preserve readability. if @whitespace_elements.include?(name) node.add_previous_sibling(Nokogiri::XML::Text.new(@whitespace_elements[name][:before].to_s, node.document)) unless node.children.empty? node.add_next_sibling(Nokogiri::XML::Text.new(@whitespace_elements[name][:after].to_s, node.document)) end end unless node.children.empty? 
unless @remove_all_contents || @remove_element_contents.include?(name) node.add_previous_sibling(node.children) end end node.unlink return end attr_allowlist = @attributes[name] || @attributes[:all] if attr_allowlist.nil? # Delete all attributes from elements with no allowlisted attributes. node.attribute_nodes.each { |attr| attr.unlink } else allow_data_attributes = attr_allowlist.include?(:data) # Delete any attribute that isn't allowed on this element. node.attribute_nodes.each do |attr| attr_name = attr.name.downcase unless attr_allowlist.include?(attr_name) # The attribute isn't in the allowlist, but may still be allowed # if it's a data attribute. unless allow_data_attributes && attr_name.start_with?("data-") && attr_name =~ REGEX_DATA_ATTR # Either the attribute isn't a data attribute or arbitrary data # attributes aren't allowed. Remove the attribute. attr.unlink next end end # The attribute is allowed. # Remove any attributes that use unacceptable protocols. if @protocols.include?(name) && @protocols[name].include?(attr_name) attr_protocols = @protocols[name][attr_name] if attr.value =~ REGEX_PROTOCOL unless attr_protocols.include?($1.downcase) attr.unlink next end else unless attr_protocols.include?(:relative) attr.unlink next end end # Leading and trailing whitespace around URLs is ignored at parse # time. Stripping it here prevents it from being escaped by the # libxml2 workaround below. attr.value = attr.value.strip end # libxml2 >= 2.9.2 doesn't escape comments within some attributes, # in an attempt to preserve server-side includes. This can result in # XSS since an unescaped double quote can allow an attacker to # inject a non-allowlisted attribute. # # Sanitize works around this by implementing its own escaping for # affected attributes, some of which can exist on any element and # some of which can only exist on `` elements. 
# # This fix is technically no longer necessary with Nokogumbo >= 2.0 # since it no longer uses libxml2's serializer, but it's retained to # avoid breaking use cases where people might be sanitizing # individual Nokogiri nodes and then serializing them manually # without Nokogumbo. # # The relevant libxml2 code is here: # if UNSAFE_LIBXML_ATTRS_GLOBAL.include?(attr_name) || (name == "a" && UNSAFE_LIBXML_ATTRS_A.include?(attr_name)) attr.value = attr.value.gsub(UNSAFE_LIBXML_ESCAPE_REGEX, UNSAFE_LIBXML_ESCAPE_CHARS) end end end # Add required attributes. if @add_attributes.include?(name) @add_attributes[name].each { |key, val| node[key] = val } end # Element-specific special cases. case name # If this is an allowlisted iframe that has children, remove all its # children. The HTML standard says iframes shouldn't have content, but # when they do, this content is parsed as text and is serialized # verbatim without being escaped, which is unsafe because legacy # browsers may still render it and execute `")).must_equal "" _(Sanitize.fragment("", allow_comments: false, elements: ["script"])) .must_equal "" end end describe "when :allow_comments is true" do before do @s = Sanitize.new(allow_comments: true, elements: ["div"]) end it "should allow comments" do _(@s.fragment("foo bar")).must_equal "foo bar" _(@s.fragment("foo " _(@s.fragment("foo " _(@s.fragment("foo bar")).must_equal "foo bar" _(@s.fragment("foo --> -->bar")).must_equal "foo --> -->bar" _(@s.fragment("foo
>bar
")).must_equal "foo
>bar
" _(Sanitize.fragment("", allow_comments: true, elements: ["script"])) .must_equal "" end end end sanitize-7.0.0/test/test_sanitize.rb0000644000004100000410000001727314744072357017614 0ustar www-datawww-data# frozen_string_literal: true require_relative "common" describe "Sanitize" do describe "initializer" do it "should not modify a transformers array in the given config" do transformers = [ lambda {} ] Sanitize.new({transformers: transformers}) _(transformers.length).must_equal(1) end end describe "instance methods" do before do @s = Sanitize.new end describe "#document" do before do @s = Sanitize.new(elements: ["html"]) end it "should sanitize an HTML document" do _(@s.document('Lorem
ipsum dolor sit
amet ')) .must_equal "Lorem ipsum dolor sit amet " end it "should not modify the input string" do input = "foo" @s.document(input) _(input).must_equal("foo") end it "should not choke on frozen documents" do _(@s.document("foo")).must_equal "foo" end it "should normalize newlines" do _(@s.document("a\r\n\n\r\r\r\nz")).must_equal "a\n\n\n\n\nz" end it "should strip control characters (except ASCII whitespace)" do sample_control_chars = "\u0001\u0008\u000b\u000e\u001f\u007f\u009f" whitespace = "\t\n\f\u0020" _(@s.document("a#{sample_control_chars}#{whitespace}z")).must_equal "a#{whitespace}z" end it "should strip non-characters" do sample_non_chars = "\ufdd0\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}" _(@s.document("a#{sample_non_chars}z")).must_equal "az" end describe "when html body exceeds Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH" do let(:content) do content = nest_html_content("foo", Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH) "#{content}" end it "raises an ArgumentError exception" do assert_raises ArgumentError do @s.document(content) end end describe "and :max_tree_depth of -1 is supplied in :parser_options" do before do @s = Sanitize.new(elements: ["html"], parser_options: {max_tree_depth: -1}) end it "does not raise an ArgumentError exception" do _(@s.document(content)).must_equal "foo" end end end end describe "#fragment" do it "should sanitize an HTML fragment" do _(@s.fragment('Lorem ipsum dolor sit
amet ')) .must_equal "Lorem ipsum dolor sit amet " end it "should not modify the input string" do input = "foo" @s.fragment(input) _(input).must_equal "foo" end it "should not choke on fragments containing or " do _(@s.fragment("foo")).must_equal "foo" _(@s.fragment("foo")).must_equal "foo" _(@s.fragment("foo")).must_equal "foo" _(@s.fragment("foo")).must_equal "foo" end it "should not choke on frozen fragments" do _(@s.fragment("foo")).must_equal "foo" end it "should normalize newlines" do _(@s.fragment("a\r\n\n\r\r\r\nz")).must_equal "a\n\n\n\n\nz" end it "should strip control characters (except ASCII whitespace)" do sample_control_chars = "\u0001\u0008\u000b\u000e\u001f\u007f\u009f" whitespace = "\t\n\f\u0020" _(@s.fragment("a#{sample_control_chars}#{whitespace}z")).must_equal "a#{whitespace}z" end it "should strip non-characters" do sample_non_chars = "\ufdd0\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}" _(@s.fragment("a#{sample_non_chars}z")).must_equal "az" end describe "when html body exceeds Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH" do let(:content) do content = nest_html_content("foo", Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH) "#{content}" end it "raises an ArgumentError exception" do assert_raises ArgumentError do @s.fragment(content) end end describe "and :max_tree_depth of -1 is supplied in :parser_options" do before do @s = Sanitize.new(parser_options: {max_tree_depth: -1}) end it "does not raise an ArgumentError exception" do _(@s.fragment(content)).must_equal "foo" end end end end describe "#node!" do it "should sanitize a Nokogiri::XML::Node" do doc = Nokogiri::HTML5.parse('Lorem ipsum dolor sit
amet ') frag = doc.fragment doc.xpath("/html/body/node()").each { |node| frag << node } @s.node!(frag) _(frag.to_html).must_equal "Lorem ipsum dolor sit amet " end describe "when the given node is a document and isn't allowlisted" do it "should raise a Sanitize::Error" do doc = Nokogiri::HTML5.parse("foo") _(proc { @s.node!(doc) }).must_raise Sanitize::Error end end end end describe "class methods" do describe ".document" do it "should sanitize an HTML document with the given config" do html = 'Lorem ipsum dolor sit
amet ' _(Sanitize.document(html, elements: ["html"])) .must_equal "Lorem ipsum dolor sit amet " end end describe ".fragment" do it "should sanitize an HTML fragment with the given config" do html = 'Lorem ipsum dolor sit
amet ' _(Sanitize.fragment(html, elements: ["strong"])) .must_equal "Lorem ipsum dolor sit amet " end end describe ".node!" do it "should sanitize a Nokogiri::XML::Node with the given config" do doc = Nokogiri::HTML5.parse('Lorem ipsum dolor sit
amet ') frag = doc.fragment doc.xpath("/html/body/node()").each { |node| frag << node } Sanitize.node!(frag, elements: ["strong"]) _(frag.to_html).must_equal "Lorem ipsum dolor sit amet " end end end private def nest_html_content(html_content, depth) "#{"" * depth}#{html_content}#{"" * depth}" end end sanitize-7.0.0/test/common.rb0000644000004100000410000000011514744072357016202 0ustar www-datawww-data# frozen_string_literal: true require "minitest/autorun" require "sanitize" sanitize-7.0.0/test/test_malicious_css.rb0000644000004100000410000000335214744072357020614 0ustar www-datawww-data# frozen_string_literal: true require_relative "common" # Miscellaneous attempts to sneak maliciously crafted CSS past Sanitize. Some of # these are courtesy of (or inspired by) the OWASP XSS Filter Evasion Cheat # Sheet. # # https://www.owasp.org/index.php/XSS_Filter_Evasion_Cheat_Sheet describe "Malicious CSS" do make_my_diffs_pretty! parallelize_me! before do @s = Sanitize::CSS.new(Sanitize::Config::RELAXED) end it "should not be possible to inject an expression by munging it with a comment" do _(@s.properties(%[width:expr/*XSS*/ession(alert('XSS'))])) .must_equal "" _(@s.properties(%[width:ex/*XSS*//*/*/pression(alert("XSS"))])) .must_equal "" end it "should not be possible to inject an expression by munging it with a newline" do _(@s.properties(%[width:\nexpression(alert('XSS'));])) .must_equal "" end it "should not allow the javascript protocol" do _(@s.properties(%[background-image:url("javascript:alert('XSS')");])) .must_equal "" _(Sanitize.fragment(%[
], Sanitize::Config::RELAXED)).must_equal "
" end it "should not allow behaviors" do _(@s.properties(%[behavior: url(xss.htc);])).must_equal "" end describe "sanitization bypass via CSS at-rule in HTML ], @s.fragment(%[]) ) end end end sanitize-7.0.0/test/test_malicious_html.rb0000644000004100000410000002741314744072357020774 0ustar www-datawww-data# frozen_string_literal: true require_relative "common" # Miscellaneous attempts to sneak maliciously crafted HTML past Sanitize. Many # of these are courtesy of (or inspired by) the OWASP XSS Filter Evasion Cheat # Sheet. # # https://www.owasp.org/index.php/XSS_Filter_Evasion_Cheat_Sheet describe "Malicious HTML" do make_my_diffs_pretty! parallelize_me! before do @s = Sanitize.new(Sanitize::Config::RELAXED) end describe "comments" do it "should not allow script injection via conditional comments" do _(@s.fragment(%[])) .must_equal "" end end describe "interpolation (ERB, PHP, etc.)" do it "should escape ERB-style tags" do _(@s.fragment("<% naughty_ruby_code %>")) .must_equal "<% naughty_ruby_code %>" _(@s.fragment("<%= naughty_ruby_code %>")) .must_equal "<%= naughty_ruby_code %>" end it "should remove PHP-style tags" do _(@s.fragment("")) .must_equal "" _(@s.fragment("")) .must_equal "" end end describe "" do it "should not be possible to inject JS via a malformed event attribute" do _(@s.document('')) .must_equal "" end end describe "' _(Sanitize.fragment(input, transformers: youtube_transformer)) .must_equal '' end it "should allow HTTPS YouTube video embeds" do input = '' _(Sanitize.fragment(input, transformers: youtube_transformer)) .must_equal '' end it "should allow protocol-relative YouTube video embeds" do input = '' _(Sanitize.fragment(input, transformers: youtube_transformer)) .must_equal '' end it "should allow privacy-enhanced YouTube video embeds" do input = '' _(Sanitize.fragment(input, transformers: youtube_transformer)) .must_equal '' end it "should not allow non-YouTube video embeds" do input = '' _(Sanitize.fragment(input, transformers: 
youtube_transformer)) .must_equal("") end end describe "DOM modification transformer" do b_to_strong_tag_transformer = lambda do |env| node = env[:node] node_name = env[:node_name] if node_name == "b" node.name = "strong" end end it "should allow the tag to be changed to a tag" do input = "text" _(Sanitize.fragment(input, elements: ["strong"], transformers: b_to_strong_tag_transformer)) .must_equal "text" end end end sanitize-7.0.0/test/test_sanitize_css.rb0000644000004100000410000003626414744072357020465 0ustar www-datawww-data# frozen_string_literal: true require_relative "common" describe "Sanitize::CSS" do make_my_diffs_pretty! parallelize_me! describe "instance methods" do before do @default = Sanitize::CSS.new @relaxed = Sanitize::CSS.new(Sanitize::Config::RELAXED[:css]) @custom = Sanitize::CSS.new(properties: %w[background color width]) end describe "#properties" do it "should sanitize CSS properties" do css = 'background: #fff; width: expression(alert("hi"));' _(@default.properties(css)).must_equal " " _(@relaxed.properties(css)).must_equal "background: #fff; " _(@custom.properties(css)).must_equal "background: #fff; " end it "should allow allowlisted URL protocols" do [ "background: url(relative.jpg)", "background: url('relative.jpg')", "background: url(http://example.com/http.jpg)", "background: url('ht\\tp://example.com/http.jpg')", "background: url(https://example.com/https.jpg)", "background: url('https://example.com/https.jpg')", "background: image-set('relative.jpg' 1x, 'relative-2x.jpg' 2x)", "background: image-set('https://example.com/https.jpg' 1x, 'https://example.com/https-2x.jpg' 2x)", "background: image-set('https://example.com/https.jpg' type('image/jpeg'), 'https://example.com/https.avif' type('image/avif'))", "background: -webkit-image-set('relative.jpg' 1x, 'relative-2x.jpg' 2x)", "background: -webkit-image-set('https://example.com/https.jpg' 1x, 'https://example.com/https-2x.jpg' 2x)", "background: 
-webkit-image-set('https://example.com/https.jpg' type('image/jpeg'), 'https://example.com/https.avif' type('image/avif'))", "background: image('relative.jpg');", "background: image('https://example.com/https.jpg');", "background: image(rtl 'https://example.com/https.jpg');" ].each do |css| _(@default.properties(css)).must_equal "" _(@relaxed.properties(css)).must_equal css _(@custom.properties(css)).must_equal "" end end it "should not allow non-allowlisted URL protocols" do [ "background: url(javascript:alert(0))", "background: url(ja\\56 ascript:alert(0))", "background: url('javascript:foo')", "background: url('ja\\56 ascript:alert(0)')", "background: url('ja\\va\\script\\:alert(0)')", "background: url('javas\\\ncript:alert(0)')", "background: url('java\\0script:foo')" ].each do |css| _(@default.properties(css)).must_equal "" _(@relaxed.properties(css)).must_equal "" _(@custom.properties(css)).must_equal "" end end it "should not allow -moz-binding" do css = "-moz-binding:url('http://ha.ckers.org/xssmoz.xml#xss')" _(@default.properties(css)).must_equal "" _(@relaxed.properties(css)).must_equal "" _(@custom.properties(css)).must_equal "" end it "should not allow expressions" do [ "width:expression(alert(1))", "width: /**/expression(alert(1)", "width:e\\78 pression(\n\nalert(\n1)", "width:\nexpression(alert(1));", "xss:expression(alert(1))", "height: foo(expression(alert(1)));" ].each do |css| _(@default.properties(css)).must_equal "" _(@relaxed.properties(css)).must_equal "" _(@custom.properties(css)).must_equal "" end end it "should not allow behaviors" do css = "behavior: url(xss.htc);" _(@default.properties(css)).must_equal "" _(@relaxed.properties(css)).must_equal "" _(@custom.properties(css)).must_equal "" end describe "when :allow_comments is true" do it "should preserve comments" do _(@relaxed.properties("color: #fff; /* comment */ width: 100px;")) .must_equal "color: #fff; /* comment */ width: 100px;" _(@relaxed.properties("color: #fff; /* \n\ncomment */ 
width: 100px;")) .must_equal "color: #fff; /* \n\ncomment */ width: 100px;" end end describe "when :allow_comments is false" do it "should strip comments" do _(@custom.properties("color: #fff; /* comment */ width: 100px;")) .must_equal "color: #fff; width: 100px;" _(@custom.properties("color: #fff; /* \n\ncomment */ width: 100px;")) .must_equal "color: #fff; width: 100px;" end end describe "when :allow_hacks is true" do it "should allow common CSS hacks" do _(@relaxed.properties("_border: 1px solid #fff; *width: 10px")) .must_equal "_border: 1px solid #fff; *width: 10px" end end describe "when :allow_hacks is false" do it "should not allow common CSS hacks" do _(@custom.properties("_border: 1px solid #fff; *width: 10px")) .must_equal " " end end end describe "#stylesheet" do it "should sanitize a CSS stylesheet" do css = %[ /* Yay CSS! */ .foo { color: #fff; } #bar { background: url(yay.jpg); } @media screen (max-width:480px) { .foo { width: 400px; } #bar:not(.baz) { height: 100px; } } ].strip _(@default.stylesheet(css).strip).must_equal %( .foo { } #bar { } ).strip _(@relaxed.stylesheet(css)).must_equal css _(@custom.stylesheet(css).strip).must_equal %( .foo { color: #fff; } #bar { } ).strip end describe "when :allow_comments is true" do it "should preserve comments" do _(@relaxed.stylesheet(".foo { color: #fff; /* comment */ width: 100px; }")) .must_equal ".foo { color: #fff; /* comment */ width: 100px; }" _(@relaxed.stylesheet(".foo { color: #fff; /* \n\ncomment */ width: 100px; }")) .must_equal ".foo { color: #fff; /* \n\ncomment */ width: 100px; }" end end describe "when :allow_comments is false" do it "should strip comments" do _(@custom.stylesheet(".foo { color: #fff; /* comment */ width: 100px; }")) .must_equal ".foo { color: #fff; width: 100px; }" _(@custom.stylesheet(".foo { color: #fff; /* \n\ncomment */ width: 100px; }")) .must_equal ".foo { color: #fff; width: 100px; }" end end describe "when :allow_hacks is true" do it "should allow common CSS hacks" 
do _(@relaxed.stylesheet(".foo { _border: 1px solid #fff; *width: 10px }")) .must_equal ".foo { _border: 1px solid #fff; *width: 10px }" end end describe "when :allow_hacks is false" do it "should not allow common CSS hacks" do _(@custom.stylesheet(".foo { _border: 1px solid #fff; *width: 10px }")) .must_equal ".foo { }" end end end describe "#tree!" do it "should sanitize a Crass CSS parse tree" do tree = Crass.parse("@import url(foo.css);\n" \ ".foo { background: #fff; font: 16pt 'Comic Sans MS'; }\n" \ "#bar { top: 125px; background: green; }") _(@custom.tree!(tree)).must_be_same_as tree _(Crass::Parser.stringify(tree)).must_equal "\n" \ ".foo { background: #fff; }\n" \ "#bar { background: green; }" end end end describe "class methods" do describe ".properties" do it "should sanitize CSS properties with the given config" do css = 'background: #fff; width: expression(alert("hi"));' _(Sanitize::CSS.properties(css)).must_equal " " _(Sanitize::CSS.properties(css, Sanitize::Config::RELAXED[:css])).must_equal "background: #fff; " _(Sanitize::CSS.properties(css, properties: %w[background color width])).must_equal "background: #fff; " end end describe ".stylesheet" do it "should sanitize a CSS stylesheet with the given config" do css = %[ /* Yay CSS! */ .foo { color: #fff; } #bar { background: url(yay.jpg); } @media screen (max-width:480px) { .foo { width: 400px; } #bar:not(.baz) { height: 100px; } } ].strip _(Sanitize::CSS.stylesheet(css).strip).must_equal %( .foo { } #bar { } ).strip _(Sanitize::CSS.stylesheet(css, Sanitize::Config::RELAXED[:css])).must_equal css _(Sanitize::CSS.stylesheet(css, properties: %w[background color width]).strip).must_equal %( .foo { color: #fff; } #bar { } ).strip end end describe ".tree!" 
do it "should sanitize a Crass CSS parse tree with the given config" do tree = Crass.parse("@import url(foo.css);\n" \ ".foo { background: #fff; font: 16pt 'Comic Sans MS'; }\n" \ "#bar { top: 125px; background: green; }") _(Sanitize::CSS.tree!(tree, properties: %w[background color width])).must_be_same_as tree _(Crass::Parser.stringify(tree)).must_equal "\n" \ ".foo { background: #fff; }\n" \ "#bar { background: green; }" end end end describe "functionality" do before do @default = Sanitize::CSS.new @relaxed = Sanitize::CSS.new(Sanitize::Config::RELAXED[:css]) end # https://github.com/rgrove/sanitize/issues/121 it "should parse the contents of @media rules properly" do css = '@media { p[class="center"] { text-align: center; }}' _(@relaxed.stylesheet(css)).must_equal css css = %[ @media (max-width: 720px) { p.foo > .bar { float: right; width: expression(body.scrollLeft + 50 + 'px'); } #baz { color: green; } @media (orientation: portrait) { #baz { color: red; } } } ].strip _(@relaxed.stylesheet(css)).must_equal %[ @media (max-width: 720px) { p.foo > .bar { float: right; } #baz { color: green; } @media (orientation: portrait) { #baz { color: red; } } } ].strip end it "should parse @page rules properly" do css = %[ @page { margin: 2cm } /* All margins set to 2cm */ @page :right { @top-center { content: "Preliminary edition" } @bottom-center { content: counter(page) } } @page { size: 8.5in 11in; margin: 10%; @top-left { content: "Hamlet"; } @top-right { content: "Page " counter(page); } } ].strip _(@relaxed.stylesheet(css)).must_equal css end describe ":at_rules" do it "should remove blockless at-rules that aren't allowlisted" do css = %[ @charset 'utf-8'; @import url('foo.css'); .foo { color: green; } ].strip _(@relaxed.stylesheet(css).strip).must_equal %( .foo { color: green; } ).strip end it "preserves allowlisted @container at-rules" do # Sample code courtesy of MDN: # https://developer.mozilla.org/en-US/docs/Web/CSS/@container css = %( @container (width > 400px) { 
h2 { font-size: 1.5em; } } /* with an optional */ @container tall (height > 30rem) { h2 { line-height: 1.6; } } /* multiple queries in a single condition */ @container (width > 400px) and style(--responsive: true) { h2 { font-size: 1.5em; } } /* condition list */ @container card (width > 400px), style(--responsive: true) { h2 { font-size: 1.5em; } } ).strip _(@relaxed.stylesheet(css).strip).must_equal css end describe "when blockless at-rules are allowlisted" do before do @scss = Sanitize::CSS.new(Sanitize::Config.merge(Sanitize::Config::RELAXED[:css], { at_rules: ["charset", "import"] })) end it "should not remove them" do css = %[ @charset 'utf-8'; @import url('foo.css'); .foo { color: green; } ].strip _(@scss.stylesheet(css)).must_equal %[ @charset 'utf-8'; @import url('foo.css'); .foo { color: green; } ].strip end it "should remove them if they have invalid blocks" do css = %( @charset { color: green } @import { color: green } .foo { color: green; } ).strip _(@scss.stylesheet(css).strip).must_equal %( .foo { color: green; } ).strip end end describe "when validating @import rules" do describe "with no validation proc specified" do before do @scss = Sanitize::CSS.new(Sanitize::Config.merge(Sanitize::Config::RELAXED[:css], { at_rules: ["import"] })) end it "should allow any URL value" do css = %[ @import url('https://somesite.com/something.css'); ].strip _(@scss.stylesheet(css).strip).must_equal %[ @import url('https://somesite.com/something.css'); ].strip end end describe "with a validation proc specified" do before do google_font_validator = proc { |url| url.start_with?("https://fonts.googleapis.com") } @scss = Sanitize::CSS.new(Sanitize::Config.merge(Sanitize::Config::RELAXED[:css], { at_rules: ["import"], import_url_validator: google_font_validator })) end it "should allow a google fonts url" do css = %[ @import 'https://fonts.googleapis.com/css?family=Indie+Flower'; @import url('https://fonts.googleapis.com/css?family=Indie+Flower'); ].strip 
_(@scss.stylesheet(css).strip).must_equal %[ @import 'https://fonts.googleapis.com/css?family=Indie+Flower'; @import url('https://fonts.googleapis.com/css?family=Indie+Flower'); ].strip end it "should not allow a nasty url" do css = %[ @import 'https://fonts.googleapis.com/css?family=Indie+Flower'; @import 'https://nastysite.com/nasty_hax0r.css'; @import url('https://nastysite.com/nasty_hax0r.css'); ].strip _(@scss.stylesheet(css).strip).must_equal %( @import 'https://fonts.googleapis.com/css?family=Indie+Flower'; ).strip end it "should not allow a blank url" do css = %[ @import 'https://fonts.googleapis.com/css?family=Indie+Flower'; @import ''; @import url(''); ].strip _(@scss.stylesheet(css).strip).must_equal %( @import 'https://fonts.googleapis.com/css?family=Indie+Flower'; ).strip end end end end end end sanitize-7.0.0/test/test_config.rb0000644000004100000410000000357714744072357017235 0ustar www-datawww-data# frozen_string_literal: true require_relative "common" describe "Config" do make_my_diffs_pretty! parallelize_me! def verify_deeply_frozen(config) _(config).must_be :frozen? if Hash === config config.each_value { |v| verify_deeply_frozen(v) } elsif Set === config || Array === config config.each { |v| verify_deeply_frozen(v) } end end it "built-in configs should be deeply frozen" do verify_deeply_frozen Sanitize::Config::DEFAULT verify_deeply_frozen Sanitize::Config::BASIC verify_deeply_frozen Sanitize::Config::RELAXED verify_deeply_frozen Sanitize::Config::RESTRICTED end describe ".freeze_config" do it "should deeply freeze and return a configuration Hash" do a = {one: {one_one: [0, "1", :a], one_two: false, one_three: Set.new([:a, :b, :c])}} b = Sanitize::Config.freeze_config(a) _(b).must_be_same_as a verify_deeply_frozen a end end describe ".merge" do it "should deeply merge a configuration Hash" do # Freeze to ensure that we get an error if either Hash is modified. 
a = Sanitize::Config.freeze_config({one: {one_one: [0, "1", :a], one_two: false, one_three: Set.new([:a, :b, :c])}}) b = Sanitize::Config.freeze_config({one: {one_two: true, one_three: 3}, two: 2}) c = Sanitize::Config.merge(a, b) _(c).wont_be_same_as a _(c).wont_be_same_as b _(c).must_equal( one: { one_one: [0, "1", :a], one_two: true, one_three: 3 }, two: 2 ) _(c[:one]).wont_be_same_as a[:one] _(c[:one][:one_one]).wont_be_same_as a[:one][:one_one] end it "should raise an ArgumentError if either argument is not a Hash" do _(proc { Sanitize::Config.merge("foo", {}) }).must_raise ArgumentError _(proc { Sanitize::Config.merge({}, "foo") }).must_raise ArgumentError end end end sanitize-7.0.0/test/test_parser.rb0000644000004100000410000000440414744072357017252 0ustar www-datawww-data# frozen_string_literal: true require_relative "common" describe "Parser" do make_my_diffs_pretty! parallelize_me! it "should translate valid entities into characters" do _(Sanitize.fragment("'é&")).must_equal("'Ă©&") end it "should translate orphaned ampersands into entities" do _(Sanitize.fragment("at&t")).must_equal("at&t") end it "should not add newlines after tags when serializing a fragment" do _(Sanitize.fragment("
foo\n\n

bar

\nbaz
quux
", elements: ["div", "p"])) .must_equal "
foo\n\n

bar

\nbaz
quux
" end it "should not have the Nokogiri 1.4.2+ unterminated script/style element bug" do _(Sanitize.fragment("foo ', default: "Lorem ipsum dolor sit amet ", restricted: "Lorem ipsum dolor sit amet ", basic: 'Lorem ipsum dolor sit
amet ', relaxed: 'Lorem ipsum dolor sit
amet ' }, malformed: { html: 'Lorem
dolor sit
amet ', default: 'Lorem ipsum dolor sit amet <script>alert("hello world");', restricted: 'Lorem ipsum dolor sit amet <script>alert("hello world");', basic: 'Lorem ipsum dolor sit
amet <script>alert("hello world");', relaxed: 'Lorem ipsum dolor sit
amet <script>alert("hello world");' } } protocols = { "protocol-based JS injection: simple, no spaces" => { html: 'foo', default: "foo", restricted: "foo", basic: 'foo', relaxed: "foo" }, "protocol-based JS injection: simple, spaces before" => { html: 'foo', default: "foo", restricted: "foo", basic: 'foo', relaxed: "foo" }, "protocol-based JS injection: simple, spaces after" => { html: 'foo', default: "foo", restricted: "foo", basic: 'foo', relaxed: "foo" }, "protocol-based JS injection: simple, spaces before and after" => { html: 'foo', default: "foo", restricted: "foo", basic: 'foo', relaxed: "foo" }, "protocol-based JS injection: preceding colon" => { html: 'foo', default: "foo", restricted: "foo", basic: 'foo', relaxed: "foo" }, "protocol-based JS injection: UTF-8 encoding" => { html: 'foo', default: "foo", restricted: "foo", basic: 'foo', relaxed: "foo" }, "protocol-based JS injection: long UTF-8 encoding" => { html: 'foo', default: "foo", restricted: "foo", basic: 'foo', relaxed: "foo" }, "protocol-based JS injection: long UTF-8 encoding without semicolons" => { html: "foo", default: "foo", restricted: "foo", basic: 'foo', relaxed: "foo" }, "protocol-based JS injection: hex encoding" => { html: 'foo', default: "foo", restricted: "foo", basic: 'foo', relaxed: "foo" }, "protocol-based JS injection: long hex encoding" => { html: 'foo', default: "foo", restricted: "foo", basic: 'foo', relaxed: "foo" }, "protocol-based JS injection: hex encoding without semicolons" => { html: "foo", default: "foo", restricted: "foo", basic: 'foo', relaxed: "foo" }, "protocol-based JS injection: null char" => { html: "", default: "", restricted: "", basic: "", relaxed: "" }, "protocol-based JS injection: invalid URL char" => { html: '', default: "", restricted: "", basic: "", relaxed: "" }, "protocol-based JS injection: spaces and entities" => { html: '', default: "", restricted: "", basic: "", relaxed: "" }, "protocol whitespace" => { html: '', default: "", restricted: "", basic: 
'', relaxed: '' } } describe "Default config" do it "should remove non-allowlisted elements, leaving safe contents behind" do _(Sanitize.fragment('foo bar baz quux')) .must_equal "foo bar baz quux" _(Sanitize.fragment('')) .must_equal "" _(Sanitize.fragment('<')) .must_equal '< script <>> alert("");' end it "should surround the contents of :whitespace_elements with space characters when removing the element" do _(Sanitize.fragment("foo
bar
baz")) .must_equal "foo bar baz" _(Sanitize.fragment("foo
bar
baz")) .must_equal "foo bar baz" _(Sanitize.fragment("foo
bar
baz")) .must_equal "foo bar baz" end it "should not choke on several instances of the same element in a row" do _(Sanitize.fragment('')) .must_equal "" end it "should not preserve the content of removed `iframe` elements" do _(Sanitize.fragment("")) .must_equal "" end it "should not preserve the content of removed `math` elements" do _(Sanitize.fragment("hello! ")) .must_equal "" end it "should not preserve the content of removed `noembed` elements" do _(Sanitize.fragment("hello! <script>alert(0)</script>")) .must_equal "" end it "should not preserve the content of removed `noframes` elements" do _(Sanitize.fragment("hello! <script>alert(0)</script>")) .must_equal "" end it "should not preserve the content of removed `noscript` elements" do _(Sanitize.fragment("")) .must_equal "" end it "should not preserve the content of removed `plaintext` elements" do _(Sanitize.fragment("hello! <script>alert(0)</script>")) .must_equal "" end it "should not preserve the content of removed `script` elements" do _(Sanitize.fragment("<script>hello! <script>alert(0)</script></script>")) .must_equal "" end it "should not preserve the content of removed `style` elements" do _(Sanitize.fragment("<style>hello! <script>alert(0)</script></style>")) .must_equal "" end it "should not preserve the content of removed `svg` elements" do _(Sanitize.fragment("<svg>hello! <script>alert(0)</script></svg>")) .must_equal "" end it "should not preserve the content of removed `xmp` elements" do _(Sanitize.fragment("<xmp>hello! 
<script>alert(0)</script></xmp>")) .must_equal "" end strings.each do |name, data| it "should clean #{name} HTML" do _(Sanitize.fragment(data[:html])).must_equal(data[:default]) end end protocols.each do |name, data| it "should not allow #{name}" do _(Sanitize.fragment(data[:html])).must_equal(data[:default]) end end end describe "Restricted config" do before do @s = Sanitize.new(Sanitize::Config::RESTRICTED) end strings.each do |name, data| it "should clean #{name} HTML" do _(@s.fragment(data[:html])).must_equal(data[:restricted]) end end protocols.each do |name, data| it "should not allow #{name}" do _(@s.fragment(data[:html])).must_equal(data[:restricted]) end end end describe "Basic config" do before do @s = Sanitize.new(Sanitize::Config::BASIC) end it "should not choke on valueless attributes" do _(@s.fragment("foo <a href>foo</a> bar")) .must_equal 'foo <a href="" rel="nofollow">foo</a> bar' end it "should downcase attribute names" do _(@s.fragment('<a HREF="javascript:alert(\'foo\')">bar</a>')) .must_equal '<a rel="nofollow">bar</a>' end strings.each do |name, data| it "should clean #{name} HTML" do _(@s.fragment(data[:html])).must_equal(data[:basic]) end end protocols.each do |name, data| it "should not allow #{name}" do _(@s.fragment(data[:html])).must_equal(data[:basic]) end end end describe "Relaxed config" do before do @s = Sanitize.new(Sanitize::Config::RELAXED) end it "should encode special chars in attribute values" do _(@s.fragment('<a href="http://example.com" title="<b>&eacute;xamples</b> & things">foo</a>')) .must_equal '<a href="http://example.com" title="<b>Ă©xamples</b> &amp; things">foo</a>' end strings.each do |name, data| it "should clean #{name} HTML" do _(@s.fragment(data[:html])).must_equal(data[:relaxed]) end end protocols.each do |name, data| it "should not allow #{name}" do _(@s.fragment(data[:html])).must_equal(data[:relaxed]) end end end describe "Custom configs" do it "should allow attributes on all elements if allowlisted under 
:all" do input = '<p class="foo">bar</p>' _(Sanitize.fragment(input)).must_equal " bar " _(Sanitize.fragment(input, { elements: ["p"], attributes: {all: ["class"]} })).must_equal input _(Sanitize.fragment(input, { elements: ["p"], attributes: {"div" => ["class"]} })).must_equal "<p>bar</p>" _(Sanitize.fragment(input, { elements: ["p"], attributes: {"p" => ["title"], :all => ["class"]} })).must_equal input end it "should not allow relative URLs when relative URLs aren't allowlisted" do input = '<a href="/foo/bar">Link</a>' _(Sanitize.fragment(input, elements: ["a"], attributes: {"a" => ["href"]}, protocols: {"a" => {"href" => ["http"]}})).must_equal "<a>Link</a>" end it "should allow relative URLs containing colons when the colon is not in the first path segment" do input = '<a href="/wiki/Special:Random">Random Page</a>' _(Sanitize.fragment(input, { elements: ["a"], attributes: {"a" => ["href"]}, protocols: {"a" => {"href" => [:relative]}} })).must_equal input end it "should allow relative URLs containing colons when the colon is part of an anchor" do input = '<a href="#fn:1">Footnote 1</a>' _(Sanitize.fragment(input, { elements: ["a"], attributes: {"a" => ["href"]}, protocols: {"a" => {"href" => [:relative]}} })).must_equal input input = '<a href="somepage#fn:1">Footnote 1</a>' _(Sanitize.fragment(input, { elements: ["a"], attributes: {"a" => ["href"]}, protocols: {"a" => {"href" => [:relative]}} })).must_equal input end it "should remove the contents of filtered nodes when :remove_contents is true" do _(Sanitize.fragment("foo bar <div>baz<span>quux</span></div>", remove_contents: true)).must_equal "foo bar " end it "should remove the contents of specified nodes when :remove_contents is an Array or Set of element names as strings" do _(Sanitize.fragment('foo bar <div>baz<span>quux</span> <b>hi</b><script>alert("hello!");</script></div>', remove_contents: ["script", "span"])).must_equal "foo bar baz hi " _(Sanitize.fragment('foo bar <div>baz<span>quux</span> 
<b>hi</b><script>alert("hello!");</script></div>', remove_contents: Set.new(["script", "span"]))).must_equal "foo bar baz hi " end it "should remove the contents of specified nodes when :remove_contents is an Array or Set of element names as symbols" do _(Sanitize.fragment('foo bar <div>baz<span>quux</span> <b>hi</b><script>alert("hello!");</script></div>', remove_contents: [:script, :span])).must_equal "foo bar baz hi " _(Sanitize.fragment('foo bar <div>baz<span>quux</span> <b>hi</b><script>alert("hello!");</script></div>', remove_contents: Set.new([:script, :span]))).must_equal "foo bar baz hi " end it "should remove the contents of allowlisted iframes" do _(Sanitize.fragment("<iframe>hi <script>hello</script></iframe>", elements: ["iframe"])).must_equal "<iframe></iframe>" end it "should not allow arbitrary HTML5 data attributes by default" do _(Sanitize.fragment('<b data-foo="bar"></b>', elements: ["b"])).must_equal "<b></b>" _(Sanitize.fragment('<b class="foo" data-foo="bar"></b>', attributes: {"b" => ["class"]}, elements: ["b"])).must_equal '<b class="foo"></b>' end it "should allow arbitrary HTML5 data attributes when the :attributes config includes :data" do s = Sanitize.new( attributes: {"b" => [:data]}, elements: ["b"] ) _(s.fragment('<b data-foo="valid" data-bar="valid"></b>')) .must_equal '<b data-foo="valid" data-bar="valid"></b>' _(s.fragment('<b data-="invalid"></b>')) .must_equal "<b></b>" _(s.fragment('<b data-="invalid"></b>')) .must_equal "<b></b>" _(s.fragment('<b data-xml="invalid"></b>')) .must_equal "<b></b>" _(s.fragment('<b data-xmlfoo="invalid"></b>')) .must_equal "<b></b>" _(s.fragment('<b data-f:oo="valid"></b>')) .must_equal "<b></b>" _(s.fragment('<b data-f/oo="partial"></b>')) .must_equal '<b data-f=""></b>' # Nokogiri quirk; not ideal, but harmless _(s.fragment('<b data-Ă©foo="valid"></b>')) .must_equal "<b></b>" # Another annoying Nokogiri quirk. 
end it "should replace whitespace_elements with configured :before and :after values" do s = Sanitize.new( whitespace_elements: { "p" => {before: "\n", after: "\n"}, "div" => {before: "\n", after: "\n"}, "br" => {before: "\n", after: "\n"} } ) _(s.fragment("<p>foo</p>")).must_equal "\nfoo\n" _(s.fragment("<p>foo</p><p>bar</p>")).must_equal "\nfoo\n\nbar\n" _(s.fragment("foo<div>bar</div>baz")).must_equal "foo\nbar\nbaz" _(s.fragment("foo<br>bar<br>baz")).must_equal "foo\nbar\nbaz" end it "should handle protocols correctly regardless of case" do input = '<a href="hTTpS://foo.com/">Text</a>' _(Sanitize.fragment(input, { elements: ["a"], attributes: {"a" => ["href"]}, protocols: {"a" => {"href" => ["https"]}} })).must_equal input input = '<a href="mailto:someone@example.com?Subject=Hello">Text</a>' _(Sanitize.fragment(input, { elements: ["a"], attributes: {"a" => ["href"]}, protocols: {"a" => {"href" => ["https"]}} })).must_equal "<a>Text</a>" end it "should sanitize protocols in data attributes even if data attributes are generically allowed" do input = '<a data-url="mailto:someone@example.com">Text</a>' _(Sanitize.fragment(input, { elements: ["a"], attributes: {"a" => [:data]}, protocols: {"a" => {"data-url" => ["https"]}} })).must_equal "<a>Text</a>" _(Sanitize.fragment(input, { elements: ["a"], attributes: {"a" => [:data]}, protocols: {"a" => {"data-url" => ["mailto"]}} })).must_equal input end it "should prevent `<meta>` tags from being used to set a non-UTF-8 charset" do _(Sanitize.document('<html><head><meta charset="utf-8"></head><body>Howdy!</body></html>', elements: %w[html head meta body], attributes: {"meta" => ["charset"]})).must_equal "<html><head><meta charset=\"utf-8\"></head><body>Howdy!</body></html>" _(Sanitize.document('<html><meta charset="utf-8">Howdy!</html>', elements: %w[html meta], attributes: {"meta" => ["charset"]})).must_equal "<html><meta charset=\"utf-8\">Howdy!</html>" _(Sanitize.document('<html><meta charset="us-ascii">Howdy!</html>', 
elements: %w[html meta], attributes: {"meta" => ["charset"]})).must_equal "<html><meta charset=\"utf-8\">Howdy!</html>" _(Sanitize.document('<html><meta http-equiv="content-type" content=" text/html; charset=us-ascii">Howdy!</html>', elements: %w[html meta], attributes: {"meta" => %w[content http-equiv]})).must_equal "<html><meta http-equiv=\"content-type\" content=\" text/html;charset=utf-8\">Howdy!</html>" _(Sanitize.document('<html><meta http-equiv="Content-Type" content="text/plain;charset = us-ascii">Howdy!</html>', elements: %w[html meta], attributes: {"meta" => %w[content http-equiv]})).must_equal "<html><meta http-equiv=\"Content-Type\" content=\"text/plain;charset=utf-8\">Howdy!</html>" end it "should not modify `<meta>` tags that already set a UTF-8 charset" do _(Sanitize.document('<html><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8"></head><body>Howdy!</body></html>', elements: %w[html head meta body], attributes: {"meta" => %w[content http-equiv]})).must_equal "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\"></head><body>Howdy!</body></html>" end it "always removes `<noscript>` elements even if `noscript` is in the allowlist" do assert_equal( "", Sanitize.fragment("<noscript>foo</noscript>", elements: ["noscript"]) ) end end end ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������sanitize-7.0.0/LICENSE������������������������������������������������������������������������������0000644�0000041�0000041�00000002063�14744072357�014417� 
0����������������������������������������������������������������������������������������������������ustar �www-data������������������������www-data���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������MIT License Copyright Ryan Grove <ryan@wonko.com> Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the 'Software'), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
�����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������sanitize-7.0.0/README.md����������������������������������������������������������������������������0000644�0000041�0000041�00000050472�14744072357�014700� 0����������������������������������������������������������������������������������������������������ustar �www-data������������������������www-data���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������# Sanitize Sanitize is an allowlist-based HTML and CSS sanitizer. It removes all HTML and/or CSS from a string except the elements, attributes, and properties you choose to allow. Using a simple configuration syntax, you can tell Sanitize to allow certain HTML elements, certain attributes within those elements, and even certain URL protocols within attributes that contain URLs. You can also allow specific CSS properties, @ rules, and URL protocols in elements or attributes containing CSS. Any HTML or CSS that you don't explicitly allow will be removed. Sanitize is based on the [Nokogiri HTML5 parser][nokogiri], which parses HTML the same way modern browsers do, and [Crass][crass], which parses CSS the same way modern browsers do. As long as your allowlist config only allows safe markup and CSS, even the most malformed or malicious input will be transformed into safe output. 
[![Gem Version](https://badge.fury.io/rb/sanitize.svg)](http://badge.fury.io/rb/sanitize) [![Tests](https://github.com/rgrove/sanitize/workflows/Tests/badge.svg)](https://github.com/rgrove/sanitize/actions?query=workflow%3ATests) [crass]:https://github.com/rgrove/crass [nokogiri]:https://github.com/sparklemotion/nokogiri ## Links * [Home](https://github.com/rgrove/sanitize/) * [API Docs](https://rubydoc.info/github/rgrove/sanitize/Sanitize) * [Issues](https://github.com/rgrove/sanitize/issues) * [Release History](https://github.com/rgrove/sanitize/releases) * [Online Demo](https://sanitize-web.fly.dev/) ## Installation ``` gem install sanitize ``` ## Quick Start ```ruby require 'sanitize' # Clean up an HTML fragment using Sanitize's permissive but safe Relaxed config. # This also sanitizes any CSS in `<style>` elements or `style` attributes. Sanitize.fragment(html, Sanitize::Config::RELAXED) # Clean up an HTML document using the Relaxed config. Sanitize.document(html, Sanitize::Config::RELAXED) # Clean up a standalone CSS stylesheet using the Relaxed config. Sanitize::CSS.stylesheet(css, Sanitize::Config::RELAXED) # Clean up some CSS properties using the Relaxed config. Sanitize::CSS.properties(css, Sanitize::Config::RELAXED) ``` ## Usage Sanitize can sanitize the following types of input: * HTML fragments * HTML documents * CSS stylesheets inside HTML `<style>` elements * CSS properties inside HTML `style` attributes * Standalone CSS stylesheets * Standalone CSS properties > [!WARNING] > > Sanitize cannot fully sanitize the contents of `<math>` or `<svg>` elements. MathML and SVG elements are [foreign elements](https://html.spec.whatwg.org/multipage/syntax.html#foreign-elements) that don't follow normal HTML parsing rules. > > By default, Sanitize will remove all MathML and SVG elements. If you add MathML or SVG elements to a custom element allowlist, you may create a security vulnerability in your application. 
### HTML Fragments A fragment is a snippet of HTML that doesn't contain a root-level `<html>` element. If you don't specify any configuration options, Sanitize will use its strictest settings by default, which means it will strip all HTML and leave only safe text behind. ```ruby html = '<b><a href="http://foo.com/">foo</a></b><img src="bar.jpg">' Sanitize.fragment(html) # => "foo" ``` To keep certain elements, add them to the element allowlist. ```ruby Sanitize.fragment(html, elements: ['b']) # => "<b>foo</b>" ``` ### HTML Documents When sanitizing a document, the `<html>` element must be allowlisted. You can also set `:allow_doctype` to `true` to allow well-formed document type definitions. ```ruby html = %[ <!DOCTYPE html> <html> <b><a href="http://foo.com/">foo</a></b><img src="bar.jpg"> </html> ] Sanitize.document(html, allow_doctype: true, elements: ['html'] ) # => "<!DOCTYPE html><html>foo\n \n</html>" ``` ### CSS in HTML To sanitize CSS in an HTML fragment or document, first allowlist the `<style>` element and/or the `style` attribute. Then allowlist the CSS properties, @ rules, and URL protocols you wish to allow. You can also choose whether to allow CSS comments or browser compatibility hacks. ```ruby html = %[ <style> div { color: green; width: 1024px; } </style> <div style="height: 100px; width: 100px;"></div> <p>hello!</p> ] Sanitize.fragment(html, elements: ['div', 'style'], attributes: {'div' => ['style']}, css: { properties: ['width'] } ) #=> %[ # <style> # div { width: 1024px; } # </style> # # <div style=" width: 100px;"></div> # hello! # ] ``` ### Standalone CSS Sanitize will happily clean up a standalone CSS stylesheet or property string without needing to invoke the HTML parser. 
```ruby css = %[ @import url(evil.css); a { text-decoration: none; } a:hover { left: expression(alert('xss!')); text-decoration: underline; } ] Sanitize::CSS.stylesheet(css, Sanitize::Config::RELAXED) # => %[ # # # a { text-decoration: none; } # # a:hover { # # text-decoration: underline; # } # ] Sanitize::CSS.properties(%[ left: expression(alert('xss!')); text-decoration: underline; ], Sanitize::Config::RELAXED) # => %[ # # text-decoration: underline; # ] ``` ## Configuration In addition to the ultra-safe default settings, Sanitize comes with three other built-in configurations that you can use out of the box or adapt to meet your needs. ### Sanitize::Config::RESTRICTED Allows only very simple inline markup. No links, images, or block elements. ```ruby Sanitize.fragment(html, Sanitize::Config::RESTRICTED) # => "<b>foo</b>" ``` ### Sanitize::Config::BASIC Allows a variety of markup including formatting elements, links, and lists. Images and tables are not allowed, links are limited to FTP, HTTP, HTTPS, and mailto protocols, and a `rel="nofollow"` attribute is added to all links to mitigate SEO spam. ```ruby Sanitize.fragment(html, Sanitize::Config::BASIC) # => '<b><a href="http://foo.com/" rel="nofollow">foo</a></b>' ``` ### Sanitize::Config::RELAXED Allows an even wider variety of markup, including images and tables, as well as safe CSS. Links are still limited to FTP, HTTP, HTTPS, and mailto protocols, while images are limited to HTTP and HTTPS. In this mode, `rel="nofollow"` is not added to links. 
```ruby Sanitize.fragment(html, Sanitize::Config::RELAXED) # => '<b><a href="http://foo.com/">foo</a></b><img src="bar.jpg">' ``` ### Custom Configuration If the built-in modes don't meet your needs, you can easily specify a custom configuration: ```ruby Sanitize.fragment(html, elements: ['a', 'span'], attributes: { 'a' => ['href', 'title'], 'span' => ['class'] }, protocols: { 'a' => {'href' => ['http', 'https', 'mailto']} } ) ``` You can also start with one of Sanitize's built-in configurations and then customize it to meet your needs. The built-in configs are deeply frozen to prevent people from modifying them (either accidentally or maliciously). To customize a built-in config, create a new copy using `Sanitize::Config.merge()`, like so: ```ruby # Create a customized copy of the Basic config, adding <div> and <table> to the # existing allowlisted elements. Sanitize.fragment(html, Sanitize::Config.merge(Sanitize::Config::BASIC, elements: Sanitize::Config::BASIC[:elements] + ['div', 'table'], remove_contents: true )) ``` The example above adds the `<div>` and `<table>` elements to a copy of the existing list of elements in `Sanitize::Config::BASIC`. If you instead want to completely overwrite the elements array with your own, you can omit the `+` operation: ```ruby # Overwrite :elements instead of creating a copy with new entries. Sanitize.fragment(html, Sanitize::Config.merge(Sanitize::Config::BASIC, elements: ['div', 'table'], remove_contents: true )) ``` ### Config Settings #### :add_attributes (Hash) Attributes to add to specific elements. If the attribute already exists, it will be replaced with the value specified here. Specify all element names and attributes in lowercase. ```ruby add_attributes: { 'a' => {'rel' => 'nofollow'} } ``` #### :allow_comments (boolean) Whether or not to allow HTML comments. Allowing comments is strongly discouraged, since IE allows script execution within conditional comments. The default value is `false`. 
#### :allow_doctype (boolean) Whether or not to allow well-formed HTML doctype declarations such as "<!DOCTYPE html>" when sanitizing a document. This setting is ignored when sanitizing fragments. The default value is `false`. #### :attributes (Hash) Attributes to allow on specific elements. Specify all element names and attributes in lowercase. ```ruby attributes: { 'a' => ['href', 'title'], 'blockquote' => ['cite'], 'img' => ['alt', 'src', 'title'] } ``` If you'd like to allow certain attributes on all elements, use the symbol `:all` instead of an element name. ```ruby # Allow the class attribute on all elements. attributes: { :all => ['class'], 'a' => ['href', 'title'] } ``` To allow arbitrary HTML5 `data-*` attributes, use the symbol `:data` in place of an attribute name. ```ruby # Allow arbitrary HTML5 data-* attributes on <div> elements. attributes: { 'div' => [:data] } ``` #### :css (Hash) Hash of the following CSS config settings to be used when sanitizing CSS (either standalone or embedded in HTML). ##### :css => :allow_comments (boolean) Whether or not to allow CSS comments. The default value is `false`. ##### :css => :allow_hacks (boolean) Whether or not to allow browser compatibility hacks such as the IE `*` and `_` hacks. These are generally harmless, but technically result in invalid CSS. The default is `false`. ##### :css => :at_rules (Array or Set) Names of CSS [at-rules][at-rules] to allow that may not have associated blocks, such as `import` or `charset`. Names should be specified in lowercase. [at-rules]:https://developer.mozilla.org/en-US/docs/Web/CSS/At-rule ##### :css => :at_rules_with_properties (Array or Set) Names of CSS [at-rules][at-rules] to allow that may have associated blocks containing CSS properties. At-rules like `font-face` and `page` fall into this category. Names should be specified in lowercase. 
##### :css => :at_rules_with_styles (Array or Set) Names of CSS [at-rules][at-rules] to allow that may have associated blocks containing style rules. At-rules like `media` and `keyframes` fall into this category. Names should be specified in lowercase. ##### :css => :import_url_validator This is a `Proc` (or other callable object) that will be called and passed the URL specified for any `@import` [at-rules][at-rules]. You can use this to limit what can be imported, for example something like the following to limit `@import` to Google Fonts URLs: ```ruby Proc.new { |url| url.start_with?("https://fonts.googleapis.com") } ``` ##### :css => :properties (Array or Set) List of CSS property names to allow. Names should be specified in lowercase. ##### :css => :protocols (Array or Set) URL protocols to allow in CSS URLs. Should be specified in lowercase. If you'd like to allow the use of relative URLs which don't have a protocol, include the symbol `:relative` in the protocol array. #### :elements (Array or Set) Array of HTML element names to allow. Specify all names in lowercase. Any elements not in this array will be removed. ```ruby elements: %w[ a abbr b blockquote br cite code dd dfn dl dt em i kbd li mark ol p pre q s samp small strike strong sub sup time u ul var ] ``` > [!WARNING] > > Sanitize cannot fully sanitize the contents of `<math>` or `<svg>` elements. MathML and SVG elements are [foreign elements](https://html.spec.whatwg.org/multipage/syntax.html#foreign-elements) that don't follow normal HTML parsing rules. > > By default, Sanitize will remove all MathML and SVG elements. If you add MathML or SVG elements to a custom element allowlist, you must assume that any content inside them will be allowed, even if that content would otherwise be removed or escaped by Sanitize. This may create a security vulnerability in your application. > [!NOTE] > > Sanitize always removes `<noscript>` elements and their contents, even if `noscript` is in the allowlist. 
> > This is because a `<noscript>` element's content is parsed differently in browsers depending on whether or not scripting is enabled. Since Nokogiri doesn't support scripting, it always parses `<noscript>` elements as if scripting is disabled. This results in edge cases where it's not possible to reliably sanitize the contents of a `<noscript>` element because Nokogiri can't fully replicate the parsing behavior of a scripting-enabled browser. #### :parser_options (Hash) [Parsing options](https://nokogiri.org/tutorials/parsing_an_html5_document.html?h=parsing+options#parsing-options) to be supplied to Nokogiri. ```ruby parser_options: { max_errors: -1, max_tree_depth: -1 } ``` #### :protocols (Hash) URL protocols to allow in specific attributes. If an attribute is listed here and contains a protocol other than those specified (or if it contains no protocol at all), it will be removed. ```ruby protocols: { 'a' => {'href' => ['ftp', 'http', 'https', 'mailto']}, 'img' => {'src' => ['http', 'https']} } ``` If you'd like to allow the use of relative URLs which don't have a protocol, include the symbol `:relative` in the protocol array: ```ruby protocols: { 'a' => {'href' => ['http', 'https', :relative]} } ``` #### :remove_contents (boolean or Array or Set) If this is `true`, Sanitize will remove the contents of any non-allowlisted elements in addition to the elements themselves. By default, Sanitize leaves the safe parts of an element's contents behind when the element is removed. If this is an Array or Set of element names, then only the contents of the specified elements (when filtered) will be removed, and the contents of all other filtered elements will be left behind. The default value can be seen in the [default config](lib/sanitize/config/default.rb). #### :transformers (Array or callable) Custom HTML transformer or array of custom transformers. See the Transformers section below for details. 
#### :whitespace_elements (Hash) Hash of element names which, when removed, should have their contents surrounded by whitespace to preserve readability. Each element name is a key pointing to another Hash, which provides the specific whitespace that should be inserted `:before` and `:after` the removed element's position. The `:after` value will only be inserted if the removed element has children, in which case it will be inserted after those children. ```ruby whitespace_elements: { 'br' => { before: "\n", after: "" }, 'div' => { before: "\n", after: "\n" }, 'p' => { before: "\n", after: "\n" } } ``` The default elements with whitespace added before and after can be seen in [the default config](lib/sanitize/config/default.rb). ## Transformers Transformers allow you to filter and modify HTML nodes using your own custom logic, on top of (or instead of) Sanitize's core filter. A transformer is any object that responds to `call()` (such as a lambda or proc). To use one or more transformers, pass them to the `:transformers` config setting. You may pass a single transformer or an array of transformers. ```ruby Sanitize.fragment(html, transformers: [ transformer_one, transformer_two ]) ``` ### Input Each transformer's `call()` method will be called once for each node in the HTML (including elements, text nodes, comments, etc.), and will receive as an argument a Hash that contains the following items: * **:config** - The current Sanitize configuration Hash. * **:is_allowlisted** - `true` if the current node has been allowlisted by a previous transformer, `false` otherwise. It's generally bad form to remove a node that a previous transformer has allowlisted. * **:node** - A `Nokogiri::XML::Node` object representing an HTML node. The node may be an element, a text node, a comment, a CDATA node, or a document fragment. Use Nokogiri's inspection methods (`element?`, `text?`, etc.) to selectively ignore node types you aren't interested in. 
* **:node_allowlist** - Set of `Nokogiri::XML::Node` objects in the current document that have been allowlisted by previous transformers, if any. It's generally bad form to remove a node that a previous transformer has allowlisted. * **:node_name** - The name of the current HTML node, always lowercase (e.g. "div" or "span"). For non-element nodes, the name will be something like "text", "comment", "#cdata-section", "#document-fragment", etc. ### Output A transformer doesn't have to return anything, but may optionally return a Hash, which may contain the following items: * **:node_allowlist** - Array or Set of specific `Nokogiri::XML::Node` objects to add to the document's allowlist, bypassing the current Sanitize config. These specific nodes and all their attributes will be allowlisted, but their children will not be. If a transformer returns anything other than a Hash, the return value will be ignored. ### Processing Each transformer has full access to the `Nokogiri::XML::Node` that's passed into it and to the rest of the document via the node's `document()` method. Any changes made to the current node or to the document will be reflected instantly in the document and passed on to subsequently called transformers and to Sanitize itself. A transformer may even call Sanitize internally to perform custom sanitization if needed. Nodes are passed into transformers in the order in which they're traversed. Sanitize performs top-down traversal, meaning that nodes are traversed in the same order you'd read them in the HTML, starting at the top node, then its first child, and so on. ```ruby html = %[ <header> <span> <strong>foo</strong> </span> <p>bar</p> </header> <footer></footer> ] transformer = lambda do |env| puts env[:node_name] if env[:node].element? end # Prints "header", "span", "strong", "p", "footer". 
Sanitize.fragment(html, transformers: transformer) ``` Transformers have a tremendous amount of power, including the power to completely bypass Sanitize's built-in filtering. Be careful! Your safety is in your own hands. ### Example: Transformer to allow image URLs by domain The following example demonstrates how to remove image elements unless they use a relative URL or are hosted on a specific domain. It assumes that the `<img>` element and its `src` attribute are already allowlisted. ```ruby require "uri" image_allowlist_transformer = lambda do |env| # Ignore everything except <img> elements. return unless env[:node_name] == "img" node = env[:node] image_uri = URI.parse(node["src"]) # Only allow relative URLs or URLs with the example.com domain. The # image_uri.host.nil? check ensures that protocol-relative URLs like # "//evil.com/foo.jpg" are not allowed. unless image_uri.host == "example.com" unless image_uri.host.nil? && image_uri.relative? node.unlink # `Nokogiri::XML::Node#unlink` removes a node from the document end end end ``` ### Example: Transformer to allow YouTube video embeds The following example demonstrates how to create a transformer that will safely allow valid YouTube video embeds without having to allow other kinds of embedded content, which would be the case if you tried to do this by just allowing all `<iframe>` elements: ```ruby youtube_transformer = lambda do |env| node = env[:node] node_name = env[:node_name] # Don't continue if this node is already allowlisted or is not an element. return if env[:is_allowlisted] || !node.element? # Don't continue unless the node is an iframe. return unless node_name == "iframe" # Verify that the video URL is actually a valid YouTube video URL. 
return unless %r{\A(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/}.match?(node["src"]) # We're now certain that this is a YouTube embed, but we still need to run # it through a special Sanitize step to ensure that no unwanted elements or # attributes that don't belong in a YouTube embed can sneak in. Sanitize.node!(node, { elements: %w[iframe], attributes: { "iframe" => %w[allowfullscreen frameborder height src width] } }) # Now that we're sure that this is a valid YouTube embed and that there are # no unwanted elements or attributes hidden inside it, we can tell Sanitize # to allowlist the current node. {node_allowlist: [node]} end html = %[ <iframe width="420" height="315" src="//www.youtube.com/embed/dQw4w9WgXcQ" frameborder="0" allowfullscreen></iframe> ].strip Sanitize.fragment(html, transformers: youtube_transformer) # => '<iframe width="420" height="315" src="//www.youtube.com/embed/dQw4w9WgXcQ" frameborder="0" allowfullscreen=""></iframe>' ``` ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������sanitize-7.0.0/CHANGELOG.md�������������������������������������������������������������������������0000644�0000041�0000041�00000071541�14744072357�015232� 0����������������������������������������������������������������������������������������������������ustar �www-data������������������������www-data���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������# Sanitize Changelog All notable changes to Sanitize are documented in this file. The format (since version 7.0.0) is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
## 7.0.0 (2024-12-29) Sanitize has no breaking API changes in this release, but the major version number has been incremented because we've dropped support for end-of-life versions of Ruby. As long as you're using Ruby 3.1.0 or later, this should be a painless upgrade! ### Added - Added over 100 new CSS properties to the relaxed config, representing all properties that are listed with a status of "Working Draft" or better in the latest [W3C "All Properties" list](https://www.w3.org/Style/CSS/all-properties.en.html). - Added the `@container` CSS at-rule to the relaxed config. - Added the `-webkit-text-fill-color` CSS property to the relaxed config. [@radar - #244](https://github.com/rgrove/sanitize/pull/244) ### Changed - Ruby 3.1.0 is now the oldest supported Ruby version. - Sanitize now requires Nokogiri 1.16.8 or higher. ## 6.1.3 (2024-08-14) ### Bug Fixes * The CSS URL protocol allowlist is now enforced on the nonstandard `-webkit-image-set` CSS function. [@ltk - #242][242] [242]:https://github.com/rgrove/sanitize/pull/242 ## 6.1.2 (2024-07-27) ### Bug Fixes * The CSS URL protocol allowlist is now properly enforced in [CSS Images Module Level 4](https://drafts.csswg.org/css-images-4/) `image` and `image-set` functions. [@ltk - #240][240] [240]:https://github.com/rgrove/sanitize/pull/240 ## 6.1.1 (2024-06-12) ### Bug Fixes * Proactively fixed a compatibility issue with libxml >= 2.13.0 (which will be used in an upcoming version of Nokogiri) that caused HTML doctype sanitization to fail. [@flavorjones - #238][238] [238]:https://github.com/rgrove/sanitize/pull/238 ## 6.1.0 (2023-09-14) ### Features * Added the `text-decoration-skip-ink` and `text-decoration-thickness` CSS properties to the relaxed config. [@martineriksson - #228][228] [228]:https://github.com/rgrove/sanitize/pull/228 ## 6.0.2 (2023-07-06) ### Bug Fixes * CVE-2023-36823: Fixed an HTML+CSS sanitization bypass that could allow XSS (cross-site scripting). 
This issue affects Sanitize versions 3.0.0 through 6.0.1. When using Sanitize's relaxed config or a custom config that allows `<style>` elements and one or more CSS at-rules, carefully crafted input could be used to sneak arbitrary HTML through Sanitize. See the following security advisory for additional details: [GHSA-f5ww-cq3m-q3g7](https://github.com/rgrove/sanitize/security/advisories/GHSA-f5ww-cq3m-q3g7) Thanks to @cure53 for finding this issue. ## 6.0.1 (2023-01-27) ### Bug Fixes * Sanitize now always removes `<noscript>` elements and their contents, even when `noscript` is in the allowlist. This fixes a sanitization bypass that could occur when `noscript` was allowed by a custom allowlist. In this scenario, carefully crafted input could sneak arbitrary HTML through Sanitize, potentially enabling an XSS (cross-site scripting) attack. Sanitize's default configs don't allow `<noscript>` elements and are not vulnerable. This issue only affects users who are using a custom config that adds `noscript` to the element allowlist. The root cause of this issue is that HTML parsing rules treat the contents of a `<noscript>` element differently depending on whether scripting is enabled in the user agent. Nokogiri doesn't support scripting so it follows the "scripting disabled" rules, but a web browser with scripting enabled will follow the "scripting enabled" rules. This means that Sanitize can't reliably make the contents of a `<noscript>` element safe for scripting enabled browsers, so the safest thing to do is to remove the element and its contents entirely. See the following security advisory for additional details: [GHSA-fw3g-2h3j-qmm7](https://github.com/rgrove/sanitize/security/advisories/GHSA-fw3g-2h3j-qmm7) Thanks to David Klein from [TU Braunschweig](https://www.tu-braunschweig.de/en/ias) (@leeN) for reporting this issue. 
* Fixed an edge case in which the contents of an "unescaped text" element (such as `<noembed>` or `<xmp>`) were not properly escaped if that element was allowlisted and was also inside an allowlisted `<math>` or `<svg>` element. The only way to encounter this situation was to ignore multiple warnings in the readme and create a custom config that allowlisted all the elements involved, including `<math>` or `<svg>`. If you're using a default config or if you heeded the warnings about MathML and SVG not being supported, you're not affected by this issue. Please let this be a reminder that Sanitize cannot safely sanitize MathML or SVG content and does not support this use case. The default configs don't allow MathML or SVG elements, and allowlisting MathML or SVG elements in a custom config may create a security vulnerability in your application. Documentation has been updated to add more warnings and to make the existing warnings about this more prominent. Thanks to David Klein from [TU Braunschweig](https://www.tu-braunschweig.de/en/ias) (@leeN) for reporting this issue. ## 6.0.0 (2021-08-03) ### Potentially Breaking Changes * Ruby 2.5.0 is now the oldest officially supported Ruby version. * Sanitize now requires Nokogiri 1.12.0 or higher, which includes Nokogumbo. The separate dependency on Nokogumbo has been removed. [@lis2 - #211][211] [211]:https://github.com/rgrove/sanitize/pull/211 ## 5.2.3 (2021-01-11) ### Bug Fixes * Ensure protocol sanitization is applied to data attributes. [@ccutrer - #207][207] [207]:https://github.com/rgrove/sanitize/pull/207 ## 5.2.2 (2021-01-06) ### Bug Fixes * Fixed a deprecation warning in Ruby 2.7+ when using keyword arguments in a custom transformer. [@mscrivo - #206][206] [206]:https://github.com/rgrove/sanitize/pull/206 ## 5.2.1 (2020-06-16) ### Bug Fixes * Fixed an HTML sanitization bypass that could allow XSS. This issue affects Sanitize versions 3.0.0 through 5.2.0. 
When HTML was sanitized using the "relaxed" config or a custom config that allows certain elements, some content in a `<math>` or `<svg>` element may not have been sanitized correctly even if `math` and `svg` were not in the allowlist. This could allow carefully crafted input to sneak arbitrary HTML through Sanitize, potentially enabling an XSS (cross-site scripting) attack. You are likely to be vulnerable to this issue if you use Sanitize's relaxed config or a custom config that allows one or more of the following HTML elements: - `iframe` - `math` - `noembed` - `noframes` - `noscript` - `plaintext` - `script` - `style` - `svg` - `xmp` See the security advisory for more details, including a workaround if you're not able to upgrade: [GHSA-p4x4-rw2p-8j8m] Many thanks to Michał Bentkowski of Securitum for reporting this issue and helping to verify the fix. [GHSA-p4x4-rw2p-8j8m]:https://github.com/rgrove/sanitize/security/advisories/GHSA-p4x4-rw2p-8j8m ## 5.2.0 (2020-06-06) ### Changes * The term "whitelist" has been replaced with "allowlist" throughout Sanitize's source and documentation. While the etymology of "whitelist" may not be explicitly racist in origin or intent, there are inherent racial connotations in the implication that white is good and black (as in "blacklist") is not. This is a change I should have made long ago, and I apologize for not making it sooner. * In transformer input, the `:is_whitelisted` and `:node_whitelist` keys are now deprecated. New `:is_allowlisted` and `:node_allowlist` keys have been added. The old keys will continue to work in order to avoid breaking existing code, but they are no longer documented and may be removed in a future semver major release. ## 5.1.0 (2019-09-07) ### Features * Added a `:parser_options` config hash, which makes it possible to pass custom parsing options to Nokogumbo.
[@austin-wang - #194][194] ### Bug Fixes * Non-characters and non-whitespace control characters are now stripped from HTML input before parsing to comply with the HTML Standard's [preprocessing guidelines][html-preprocessing]. Prior to this Sanitize had adhered to [older W3C guidelines][unicode-xml] that have since been withdrawn. [#179][179] [179]:https://github.com/rgrove/sanitize/issues/179 [194]:https://github.com/rgrove/sanitize/pull/194 [html-preprocessing]:https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream [unicode-xml]:https://www.w3.org/TR/unicode-xml/ ## 5.0.0 (2018-10-14) For most users, upgrading from 4.x shouldn't require any changes. However, the minimum required Ruby version has changed, and Sanitize 5.x's HTML output may differ in some small ways from 4.x's output. If this matters to you, please review the changes below carefully. ### Potentially Breaking Changes * Ruby 2.3.0 is now the oldest officially supported Ruby version. Sanitize may work in older 2.x Rubies, but they aren't actively tested. Sanitize definitely no longer works in Ruby 1.9.x. * Upgraded to Nokogumbo 2.x, which fixes various bugs and adds standard-compliant HTML serialization. [@stevecheckoway - #189][189] * Children of the following elements are now removed by default when these elements are removed, rather than being preserved and escaped: - `iframe` - `noembed` - `noframes` - `noscript` - `script` - `style` * Children of allowlisted `iframe` elements are now always removed. In modern HTML, `iframe` elements should never have children. In HTML 4 and earlier `iframe` elements were allowed to contain fallback content for legacy browsers, but it's been almost two decades since that was useful. * Fixed a bug that caused `:remove_contents` to behave as if it were set to `true` when it was actually an Array. 
[189]:https://github.com/rgrove/sanitize/pull/189 ## 4.6.6 (2018-07-23) * Improved performance and memory usage by optimizing `Sanitize#transform_node!` [@stanhu - #183][183] [183]:https://github.com/rgrove/sanitize/pull/183 ## 4.6.5 (2018-05-16) * Improved performance slightly by tweaking the order of built-in transformers. [@rafbm - #180][180] [180]:https://github.com/rgrove/sanitize/pull/180 ## 4.6.4 (2018-03-20) * Fixed: A change introduced in 4.6.2 broke certain transformers that relied on being able to mutate the name of an HTML node. That change has been reverted and a test has been added to cover this case. [@zetter - #177][177] [177]:https://github.com/rgrove/sanitize/issues/177 ## 4.6.3 (2018-03-19) * [CVE-2018-3740][176]: Fixed an HTML injection vulnerability that could allow XSS. When Sanitize <= 4.6.2 is used in combination with libxml2 >= 2.9.2, a specially crafted HTML fragment can cause libxml2 to generate improperly escaped output, allowing non-allowlisted attributes to be used on allowlisted elements. Sanitize now performs additional escaping on affected attributes to prevent this. Many thanks to the Shopify Application Security Team for responsibly reporting this issue. ## 4.6.2 (2018-03-19) * Reduced string allocations to optimize memory usage. [@janklimo - #175][175] [175]:https://github.com/rgrove/sanitize/pull/175 ## 4.6.1 (2018-03-15) * Added support for frozen string literals in Ruby 2.4+. [@flavorjones - #174][174] [174]:https://github.com/rgrove/sanitize/pull/174 ## 4.6.0 (2018-01-29) * Loosened the Nokogumbo dependency to allow installing semver-compatible versions greater than or equal to v1.4. [@rafbm - #171][171] [171]:https://github.com/rgrove/sanitize/pull/171 ## 4.5.0 (2017-06-04) * Added SVG-related CSS properties to the relaxed config. See [the diff][161] for the full list of added properties. 
[@louim - #161][161] * Fixed: Sanitize now strips null bytes (`\u0000`) before passing input to Nokogumbo, since they can cause recent versions to crash with a failed assertion in the Gumbo parser. [161]:https://github.com/rgrove/sanitize/pull/161 ## 4.4.0 (2016-09-29) * Added `srcset` to the attribute allowlist for `img` elements in the relaxed config. [@ejtttje - #156][156] [156]:https://github.com/rgrove/sanitize/pull/156 ## 4.3.0 (2016-09-20) * Methods can now be used as transformers. [@Skipants - #155][155] [155]:https://github.com/rgrove/sanitize/pull/155 ## 4.2.0 (2016-08-22) * Added `-webkit-font-smoothing` to the relaxed CSS config. [@louim - #154][154] * Fixed: Nokogumbo >=1.4.9 changed its behavior in a way that allowed invalid doctypes (like `<!DOCTYPE nonsense>`) when the `:allow_doctype` config setting was `true`. Invalid doctypes are now coerced to valid ones as they were prior to this Nokogumbo change. [154]:https://github.com/rgrove/sanitize/pull/154 ## 4.1.0 (2016-06-17) * Added a new CSS config setting, `:import_url_validator`. This is a Proc or other callable object that will be called with each `@import` URL, and should return `true` to allow the URL or `false` to remove it. [@nikz - #153][153] [153]:https://github.com/rgrove/sanitize/pull/153/ ## 4.0.1 (2015-12-09) * Unpinned the Nokogumbo dependency. [@rubys - #141][141] [141]:https://github.com/rgrove/sanitize/pull/141 ## 4.0.0 (2015-04-20) ### Potentially breaking changes * Added two new CSS config settings, `:at_rules_with_properties` and `:at_rules_with_styles`. These allow you to define which at-rules should be allowed to contain properties and which should be allowed to contain style rules. Previously this was hard-coded internally. [#111][111] The previous `:at_rules` setting still exists, and defines at-rules that may not have associated blocks, such as `@import`. 
If you have a custom config that contains an `:at_rules` setting, you may need to move rules that can have blocks to either `:at_rules_with_properties` or `:at_rules_with_styles`. See Sanitize's relaxed config for an example. ### Other changes * Added full support for CSS `@page` rules in the relaxed config, including support for all page-margin box rules (such as `@top-left`, `@bottom-center`, etc.) * Added the following CSS at-rules to the relaxed config: - `@-moz-keyframes` - `@-o-keyframes` - `@-webkit-keyframes` - `@document` * Added a whole bunch of CSS properties to the relaxed config. View the complete list [here](https://gist.github.com/rgrove/044cc7e9a5b44f583c05). * Small performance improvements. * Fixed: Upgraded Crass to 1.0.2 to pick up a fix that affected the parsing of CSS `@page` rules. [111]:https://github.com/rgrove/sanitize/issues/111 ## 3.1.2 (2015-02-22) * Fixed: Deleting a node in a custom transformer could trigger a memory leak in Nokogiri if that node's children were later reparented, which the built-in CleanElement transformer did by default. The CleanElement transformer is now careful not to reparent the children of deleted nodes. [#129][129] [129]:https://github.com/rgrove/sanitize/issues/129 ## 3.1.1 (2015-02-04) * Fixed: `#document` and `#fragment` failed on frozen strings, and could unintentionally modify unfrozen strings if they used an encoding other than UTF-8 or if they contained characters not allowed in HTML. [@AnchorCat - #128][128] [128]:https://github.com/rgrove/sanitize/pull/128 ## 3.1.0 (2014-12-22) * Added the following CSS properties to the relaxed config. [@ehudc - #120][120] - `-moz-text-size-adjust` - `-ms-text-size-adjust` - `-webkit-text-size-adjust` - `text-size-adjust` * Updated Nokogumbo to 1.2.0 to pick up a fix for a Gumbo bug where the entity `&AElig;` left its semicolon behind when it was converted to a character during parsing.
[#119][119] [119]:https://github.com/rgrove/sanitize/issues/119 [120]:https://github.com/rgrove/sanitize/pull/120 ## 3.0.4 (2014-12-12) * Fixed: Harmless whitespace preceding a URL protocol (such as " http://") caused the URL to be removed even when the protocol was allowlisted. [@benubois - #126][126] [126]:https://github.com/rgrove/sanitize/pull/126 ## 3.0.3 (2014-10-29) * Fixed: Some CSS selectors weren't parsed correctly inside the body of a `@media` block, causing them to be removed even when allowlist rules should have allowed them to remain. [#121][121] [121]:https://github.com/rgrove/sanitize/issues/121 ## 3.0.2 (2014-09-02) * Updated Nokogumbo to 1.1.12, because 1.1.11 silently reverted the change we were trying to pick up in the last release. Now issue [#114][114] is _actually_ fixed. ## 3.0.1 (2014-09-02) * Updated Nokogumbo to 1.1.11 to pick up a fix for a Gumbo bug in which certain HTML character entities, such as `&Ouml;`, were parsed incorrectly, leaving the semicolon behind in the output. [#114][114] [114]:https://github.com/rgrove/sanitize/issues/114 ## 3.0.0 (2014-06-21) As of this version, Sanitize adheres strictly to the [SemVer 2.0.0][semver] versioning standard. This release contains API and output changes that are incompatible with previous releases, as indicated by the major version increment. [semver]:http://semver.org/ ### Backwards-incompatible changes * HTML is now parsed using Google's Gumbo HTML5 parser, which adheres to the HTML5 parsing spec and behaves much more like modern browser parsers than the previous libxml2-based parser. As a result, HTML output may differ from that of previous versions of Sanitize. * All transformers now traverse the document from the top down, starting with the first node, then its first child, and so on. The `:transformers_breadth` config has been removed, and old bottom-up transformers (the previous default) may need to be rewritten. 
* Sanitize's built-in configs are now deeply frozen to prevent people from modifying them (either accidentally or maliciously). To customize a built-in config, create a new copy using `Sanitize::Config.merge()`, like so: ```ruby Sanitize.fragment(html, Sanitize::Config.merge(Sanitize::Config::BASIC, :elements => Sanitize::Config::BASIC[:elements] + ['div', 'table'], :remove_contents => true )) ``` * The `clean!` and `clean_document!` methods were removed, since they weren't useful and tended to confuse people. * The `clean` method was renamed to `fragment` to more clearly indicate that its intended use is to sanitize an HTML fragment. * The `clean_document` method was renamed to `document`. * The `clean_node!` method was renamed to `node!`. * The `document` method now raises a `Sanitize::Error` if the `<html>` element isn't allowlisted, rather than a `RuntimeError`. This error is also now raised regardless of the `:remove_contents` config setting. * The `:output` config has been removed. Output is now always HTML, not XHTML. * The `:output_encoding` config has been removed. Output is now always UTF-8. ### Other changes * Added advanced CSS sanitization support using [Crass][crass], which is fully compliant with the CSS Syntax Module Level 3 parsing spec. The contents of allowlisted `<style>` elements and `style` attributes in HTML will be sanitized as CSS, or you can use the `Sanitize::CSS` class to manually sanitize CSS stylesheets or properties. * Added an `:allow_doctype` setting. When `true`, well-formed doctype definitions will be allowed in documents. When `false` (the default), doctype definitions will be removed from documents. Doctype definitions are never allowed in fragments, regardless of this setting. * Added the following elements to the relaxed config, in addition to various attributes: `article`, `aside`, `body`, `data`, `div`, `footer`, `head`, `header`, `html`, `main`, `nav`, `section`, `span`, `style`, `title`. 
* The `:whitespace_elements` config is now a Hash, and allows you to specify the text that should be inserted before and after these elements when they're removed. The old-style Array-based config value is still supported for backwards compatibility. [@alperkokmen - #94][94] * Unsuitable Unicode characters are now removed from HTML before it's parsed. [#106][106] * Fixed: Non-tag brackets in input like `"1 > 2 and 2 < 1"` are now parsed and escaped correctly in accordance with the HTML5 spec, becoming `"1 &gt; 2 and 2 &lt; 1"`. [#83][83] * Fixed: Siblings added after the current node during traversal are now also traversed. In previous versions they were simply skipped. [#91][91] * Fixed: Nokogiri has been smacked and instructed to stop adding newlines after certain elements, because if people wanted newlines there they'd have put them there, dammit. [#103][103] * Fixed: Added a workaround for a libxml2 bug that caused an undesired content-type meta tag to be added to all documents with `<head>` elements. [Nokogiri #1008][n1008] [crass]:https://github.com/rgrove/crass [83]:https://github.com/rgrove/sanitize/issues/83 [91]:https://github.com/rgrove/sanitize/issues/91 [94]:https://github.com/rgrove/sanitize/pull/94/ [103]:https://github.com/rgrove/sanitize/issues/103 [106]:https://github.com/rgrove/sanitize/issues/106 [n1008]:https://github.com/sparklemotion/nokogiri/issues/1008 ## 2.1.1 (2018-09-30) * [CVE-2018-3740][176]: Fixed an HTML injection vulnerability that could allow XSS (backported from Sanitize 4.6.3). [@dometto - #188][188] When Sanitize <= 2.1.0 is used in combination with libxml2 >= 2.9.2, a specially crafted HTML fragment can cause libxml2 to generate improperly escaped output, allowing non-allowlisted attributes to be used on allowlisted elements. Sanitize now performs additional escaping on affected attributes to prevent this. Many thanks to the Shopify Application Security Team for responsibly reporting this issue. 
[176]:https://github.com/rgrove/sanitize/issues/176 [188]:https://github.com/rgrove/sanitize/pull/188 ## 2.1.0 (2014-01-13) * Added support for allowlisting arbitrary HTML5 `data-*` attributes. Use the symbol `:data` instead of an attribute name in the `:attributes` config to indicate that arbitrary data attributes should be allowed on an element. * Added the following elements to the relaxed config: `address`, `bdi`, `hr`, and `summary`. * Fixed: A colon (`:`) character in a URL fragment identifier such as `#foo:1` was incorrectly treated as a protocol delimiter. [@heathd - #87][87] [87]:https://github.com/rgrove/sanitize/pull/87 ## 2.0.6 (2013-07-10) * Fixed: Version 2.0.5 inadvertently included some work-in-progress changes that shouldn't have made their way into the master branch. This is what happens when I release before coffee instead of after. ## 2.0.5 (2013-07-10) * Loosened the Nokogiri dependency back to >= 1.4.4 to allow Sanitize to coexist in newer Rubies with other libraries that restrict Nokogiri to 1.5.x for 1.8.7 compatibility. Sanitize still no longer supports 1.8.7, but this should make life easier for people who need those other libs. ## 2.0.4 (2013-06-12) * Added `Sanitize.clean_document`, which sanitizes a full HTML document rather than just a fragment. \[Ben Anderson] * Nokogiri dependency bumped to 1.6.x. * Dropped support for Ruby versions older than 1.9.2. ## 2.0.3 (2011-07-01) * Loosened the Nokogiri dependency to allow Nokogiri 1.5.x. ## 2.0.2 (2011-05-21) * Fixed a bug in which a protocol like "java\script:" would be translated to "java%5Cscript:" and allowed through the filter when relative URLs were enabled. This didn't actually allow malicious code to run, but it is undesired behavior. ## 2.0.1 (2011-03-16) * Updated the protocol regex to anchor at the beginning of the string rather than the beginning of a line. 
\[Eaden McKee] ## 2.0.0 (2011-01-15) * The environment data passed into transformers and the return values expected from transformers have changed. Old transformers will need to be updated. See the README for details. * Transformers now receive nodes of all types, not just element nodes. * Sanitize's own core filtering logic is now implemented as a set of always-on transformers. * The default value for the `:output` config is now `:html`. Previously it was `:xhtml`. * Added a `:whitespace_elements` config, which specifies elements (such as `<br>` and `<p>`) that should be replaced with whitespace when removed in order to preserve readability. See the README for the default list of elements that will be replaced with whitespace when removed. * Added a `:transformers_breadth` config, which may be used to specify transformers that should traverse nodes in a breadth-first mode rather than the default depth-first mode. * Added the `abbr`, `dfn`, `kbd`, `mark`, `s`, `samp`, `time`, and `var` elements to the allowlists for the basic and relaxed configs. * Added the `bdo`, `del`, `figcaption`, `figure`, `hgroup`, `ins`, `rp`, `rt`, `ruby`, and `wbr` elements to the allowlist for the relaxed config. * The `dir`, `lang`, and `title` attributes are now allowlisted for all elements in the relaxed config. * Bumped minimum Nokogiri version to 1.4.4 to avoid a bug in 1.4.2+ (issue #315) that caused `</body></html>` to be appended to the CDATA inside unterminated script and style elements. ## 1.2.1 (2010-04-20) * Added a `:remove_contents` config setting. If set to `true`, Sanitize will remove the contents of all non-allowlisted elements in addition to the elements themselves. If set to an array of element names, Sanitize will remove the contents of only those elements (when filtered), and leave the contents of other filtered elements. 
\[Thanks to Rafael Souza for the array option] * Added an `:output_encoding` config setting to allow the character encoding for HTML output to be specified. The default is utf-8. * The environment hash passed into transformers now includes a `:node_name` item containing the lowercase name of the current HTML node (e.g. "div"). * Returning anything other than a Hash or nil from a transformer will now raise a meaningful `Sanitize::Error` exception rather than an unintended `NameError`. ## 1.2.0 (2010-01-17) * Requires Nokogiri ~> 1.4.1. * Added support for transformers, which allow you to filter and alter nodes using your own custom logic, on top of (or instead of) Sanitize's core filter. See the README for details and examples. * Added `Sanitize.clean_node!`, which sanitizes a `Nokogiri::XML::Node` and all its children. * Added elements `<h1>` through `<h6>` to the Relaxed allowlist. \[Suggested by David Reese] ## 1.1.0 (2009-10-11) * Migrated from Hpricot to Nokogiri. Requires libxml2 >= 2.7.2 \[Adam Hooper] * Added an `:output` config setting to allow the output format to be specified. Supported formats are `:xhtml` (the default) and `:html` (which outputs HTML4). * Changed protocol regex to ensure Sanitize doesn't kill URLs with colons in path segments. \[Peter Cooper] ## 1.0.8 (2009-04-23) * Added a workaround for an Hpricot bug that prevents attribute names from being downcased in recent versions of Hpricot. This was exploitable to prevent non-allowlisted protocols from being cleaned. \[Reported by Ben Wanicur] ## 1.0.7 (2009-04-11) * Requires Hpricot 0.8.1+, which is finally compatible with Ruby 1.9.1. * Fixed a bug that caused named character entities containing digits (like `&sup2;`) to be escaped when they shouldn't have been. \[Reported by Sebastian Steinmetz] ## 1.0.6 (2009-02-23) * Removed htmlentities gem dependency. * Existing well-formed character entity references in the input string are now preserved rather than being decoded and re-encoded. 
* The `'` character is now encoded as `&#39;` instead of `&apos;` to prevent problems in IE6. * You can now specify the symbol `:all` in place of an element name in the attributes config hash to allow certain attributes on all elements. \[Thanks to Mutwin Kraus] ## 1.0.5 (2009-02-05) * Fixed a bug introduced in version 1.0.3 that prevented non-allowlisted protocols from being cleaned when relative URLs were allowed. \[Reported by Dev Purkayastha] * Fixed "undefined method `parent='" exceptions caused by parser changes in edge Hpricot. ## 1.0.4 (2009-01-16) * Fixed a bug that made it possible to sneak a non-allowlisted element through by repeating it several times in a row. All versions of Sanitize prior to 1.0.4 are vulnerable. \[Reported by Cristobal] ## 1.0.3 (2009-01-15) * Fixed a bug whereby incomplete Unicode or hex entities could be used to prevent non-allowlisted protocols from being cleaned. Since IE6 and Opera still decode the incomplete entities, users of those browsers may be vulnerable to malicious script injection on websites using versions of Sanitize prior to 1.0.3. ## 1.0.2 (2009-01-04) * Fixed a bug that caused an exception to be thrown when parsing a valueless attribute that's expected to contain a URL. ## 1.0.1 (2009-01-01) * You can now specify `:relative` in a protocol config array to allow attributes containing relative URLs with no protocol. The Basic and Relaxed configs have been updated to allow relative URLs. * Added a workaround for an Hpricot bug that causes HTML entities for non-ASCII characters to be replaced by question marks, and all other entities to be destructively decoded. ## 1.0.0 (2008-12-25) * First release. 
#########################################################
# This file has been automatically generated by gem2tgz #
#########################################################
# -*- encoding: utf-8 -*-
# stub: sanitize 7.0.0 ruby lib

# Gem specification for Sanitize 7.0.0.
#
# Declares the gem's identity, metadata, packaged files, supported Ruby
# version and runtime dependencies (crass and nokogiri). Optional attributes
# are only assigned when the Specification object responds to the matching
# setter.
Gem::Specification.new do |s|
  s.name = "sanitize".freeze
  s.version = "7.0.0"

  # Only set when this Specification object supports the attribute.
  s.required_rubygems_version = Gem::Requirement.new(">= 0".freeze) if s.respond_to? :required_rubygems_version=
  if s.respond_to? :metadata=
    s.metadata = {
      "changelog_uri" => "https://github.com/rgrove/sanitize/blob/main/CHANGELOG.md",
      "documentation_uri" => "https://rubydoc.info/github/rgrove/sanitize",
      "rubygems_mfa_required" => "true"
    }
  end

  s.require_paths = ["lib".freeze]
  s.authors = ["Ryan Grove".freeze]
  s.date = "2024-12-30"

  # The description previously ended with a stray apostrophe after the final
  # period ("allow.'"); it has been removed so the published text is clean.
  s.description = "Sanitize is an allowlist-based HTML and CSS sanitizer. It removes all HTML\n" \
                  "and/or CSS from a string except the elements, attributes, and properties you\n" \
                  "choose to allow.\n".freeze
  s.email = "ryan@wonko.com".freeze

  # Every file shipped in the gem: docs, library sources and the test suite.
  s.files = [
    "CHANGELOG.md".freeze,
    "LICENSE".freeze,
    "README.md".freeze,
    "lib/sanitize.rb".freeze,
    "lib/sanitize/config.rb".freeze,
    "lib/sanitize/config/basic.rb".freeze,
    "lib/sanitize/config/default.rb".freeze,
    "lib/sanitize/config/relaxed.rb".freeze,
    "lib/sanitize/config/restricted.rb".freeze,
    "lib/sanitize/css.rb".freeze,
    "lib/sanitize/transformers/clean_cdata.rb".freeze,
    "lib/sanitize/transformers/clean_comment.rb".freeze,
    "lib/sanitize/transformers/clean_css.rb".freeze,
    "lib/sanitize/transformers/clean_doctype.rb".freeze,
    "lib/sanitize/transformers/clean_element.rb".freeze,
    "lib/sanitize/version.rb".freeze,
    "test/common.rb".freeze,
    "test/test_clean_comment.rb".freeze,
    "test/test_clean_css.rb".freeze,
    "test/test_clean_doctype.rb".freeze,
    "test/test_clean_element.rb".freeze,
    "test/test_config.rb".freeze,
    "test/test_malicious_css.rb".freeze,
    "test/test_malicious_html.rb".freeze,
    "test/test_parser.rb".freeze,
    "test/test_sanitize.rb".freeze,
    "test/test_sanitize_css.rb".freeze,
    "test/test_transformers.rb".freeze
  ]

  s.homepage = "https://github.com/rgrove/sanitize/".freeze
  s.licenses = ["MIT".freeze]
  s.required_ruby_version = Gem::Requirement.new(">= 3.1.0".freeze)
  s.rubygems_version = "3.3.15".freeze
  s.summary = "Allowlist-based HTML and CSS sanitizer.".freeze

  if s.respond_to? :specification_version
    s.specification_version = 4
  end

  # Runtime dependencies. Both branches declare the same requirements; the
  # fallback is used when `add_runtime_dependency` is not available.
  if s.respond_to? :add_runtime_dependency
    s.add_runtime_dependency(%q<crass>.freeze, ["~> 1.0.2"])
    s.add_runtime_dependency(%q<nokogiri>.freeze, [">= 1.16.8"])
  else
    s.add_dependency(%q<crass>.freeze, ["~> 1.0.2"])
    s.add_dependency(%q<nokogiri>.freeze, [">= 1.16.8"])
  end
end
��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������
��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������
����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������