truncato-0.7.12/0000755000004100000410000000000014263137635013502 5ustar www-datawww-datatruncato-0.7.12/README.md0000644000004100000410000000472314263137635014767 0ustar www-datawww-data
# truncato

*truncato* is a Ruby library for truncating HTML strings keeping the markup valid.

## Installing

In your `Gemfile`

```ruby
gem 'truncato'
```

## Usage

```ruby
Truncato.truncate "<p>some text</p>", max_length: 4 #=> "<p>s...</p>"
Truncato.truncate "<p>some text</p>", max_length: 4, count_tags: false #=> "<p>some...</p>"
```

The configuration options are:

* `max_length`: The size, in characters, to truncate to (`30` by default)
* `filtered_attributes`: Array of attribute names that will be removed in the truncated string. This allows you to make the truncated string shorter by excluding the content of attributes you can discard in some given context, e.g. the HTML `style` attribute.
* `filtered_tags`: Array of tags that will be removed in the truncated string. If a tag is excluded, all the nested tags under it will be excluded too.
* `count_tags`: Boolean value indicating whether tags size should be considered when truncating (`true` by default)
* `tail_before_final_tag`: Boolean value indicating whether to apply a tail before the final closing tag (`false` by default)
* `comments`: Boolean value indicating whether to include comments in parsed results (`false` by default)
* `tail`: The string to append when the truncation occurs (`'...'` by default)
* `count_tail`: Boolean value indicating whether to include the tail within the bounds of the provided max length (`false` by default)

## Performance

Truncato was designed with performance in mind. Its main motivation was that existing libs couldn't truncate a multiple-MB document into a few-KB one in a reasonable time. It uses the [Nokogiri](http://nokogiri.org/) SAX parser.

There is a benchmark included that generates a synthetic XML of 4MB and truncates it to 400 KB. You can run the benchmark using

```ruby
rake truncato:benchmark
```

There is also a comparison benchmark that tests the previous data with other alternatives

```ruby
rake truncato:vendor_compare
```

The results comparing truncato with other libs:

| | Truncato | truncate_html | HTML Truncator | peppercorn |
|---|---|---|---|---|
| Time for truncating a 4MB XML document to 4KB | 1.5 s | 20 s | 220 s | 232 s |

## Running the tests ```ruby rake spec ``` truncato-0.7.12/Rakefile0000644000004100000410000000110214263137635015141 0ustar www-datawww-databegin require 'bundler/setup' rescue LoadError puts 'You must `gem install bundler` and `bundle install` to run rake tasks' end require 'rdoc/task' RDoc::Task.new(:rdoc) do |rdoc| rdoc.rdoc_dir = 'rdoc' rdoc.title = 'MailgunRails' rdoc.options << '--line-numbers' rdoc.rdoc_files.include('README.rdoc') rdoc.rdoc_files.include('lib/**/*.rb') end Bundler::GemHelper.install_tasks require 'rake/testtask' Rake::TestTask.new(:test) do |t| t.libs << 'lib' t.libs << 'test' t.pattern = 'test/**/*_test.rb' t.verbose = false end task default: :test truncato-0.7.12/lib/0000755000004100000410000000000014263137635014250 5ustar www-datawww-datatruncato-0.7.12/lib/truncato/0000755000004100000410000000000014263137635016107 5ustar www-datawww-datatruncato-0.7.12/lib/truncato/truncated_sax_document.rb0000644000004100000410000001211414263137635023175 0ustar www-datawww-datarequire 'nokogiri' require 'htmlentities' class TruncatedSaxDocument < Nokogiri::XML::SAX::Document IGNORABLE_TAGS = %w(html head body) SINGLE_TAGS = %w{br img} attr_reader :truncated_string, :max_length, :max_length_reached, :tail, :count_tags, :filtered_attributes, :filtered_tags, :ignored_levels def initialize(options) @html_coder = HTMLEntities.new capture_options options init_parsing_state end def start_element name, attributes enter_ignored_level if filtered_tags.include?(name) return if @max_length_reached || ignorable_tag?(name) || ignore_mode? @closing_tags.push name unless single_tag_element? name append_to_truncated_string opening_tag(name, attributes), overriden_tag_length end def characters decoded_string return if @max_length_reached || ignore_mode? remaining_length = max_length - @estimated_length - 1 string_to_append = decoded_string.length > remaining_length ? 
truncate_string(decoded_string, remaining_length) : decoded_string append_to_truncated_string @html_coder.encode(string_to_append), string_to_append.length end def comment string if @comments return if @max_length_reached process_comment string end end def end_element name if filtered_tags.include?(name) && ignore_mode? exit_ignored_level return end return if @max_length_reached || ignorable_tag?(name) || ignore_mode? unless single_tag_element? name @closing_tags.pop append_to_truncated_string closing_tag(name), overriden_tag_length end end def end_document close_truncated_document if max_length_reached end private def capture_options(options) @max_length = options[:max_length] @count_tags = options [:count_tags] @count_tail = options.fetch(:count_tail, false) @tail = options[:tail] @filtered_attributes = options[:filtered_attributes] || [] @filtered_tags = options[:filtered_tags] || [] @tail_before_final_tag = options.fetch(:tail_before_final_tag, false) @comments = options.fetch(:comments, false) end def process_comment(string) remaining_length = max_length - @estimated_length - 1 string_to_append = comment_tag(string).length > remaining_length ? truncate_comment(comment_tag(string), remaining_length) : comment_tag(string) append_to_truncated_string string_to_append end def comment_tag comment "" end def init_parsing_state @truncated_string = "" @closing_tags = [] @estimated_length = @count_tail ? tail_length : 0 @max_length_reached = false @ignored_levels = 0 end def tail_length tail.match(/^&\w+;$/).nil? ? tail.length : 1 end def single_tag_element? name SINGLE_TAGS.include? name end def append_to_truncated_string string, overriden_length=nil @truncated_string << string increase_estimated_length(overriden_length || string.length) end def opening_tag name, attributes attributes_string = attributes_to_string attributes if single_tag_element? 
name "<#{name}#{attributes_string}/>" else "<#{name}#{attributes_string}>" end end def attributes_to_string attributes return "" if attributes.empty? attributes_string = concatenate_attributes_declaration attributes attributes_string.rstrip end def concatenate_attributes_declaration attributes attributes.inject(' ') do |string, attribute| key, value = attribute next string if @filtered_attributes.include? key string << "#{key}='#{@html_coder.encode value}' " end end def closing_tag name "" end def increase_estimated_length amount @estimated_length += amount check_max_length_reached end def check_max_length_reached @max_length_reached = true if @estimated_length >= max_length end def truncate_string string, remaining_length if @tail_before_final_tag string[0..remaining_length] else @tail_appended = true "#{string[0..remaining_length]}#{tail}" end end def truncate_comment string, remaining_length if @tail_before_final_tag string[0..remaining_length] else @tail_appended = true "#{string[0..remaining_length]}#{tail}-->" end end def close_truncated_document append_tail_between_closing_tags if @tail_before_final_tag append_to_truncated_string tail unless @tail_appended append_closing_tags end def append_closing_tags @closing_tags.reverse.each { |name| append_to_truncated_string closing_tag name } end def overriden_tag_length @count_tags ? nil : 0 end def ignorable_tag?(name) artificial_root_name?(name) || IGNORABLE_TAGS.include?(name.downcase) end def artificial_root_name? name name == Truncato::ARTIFICIAL_ROOT_NAME end def append_tail_between_closing_tags append_to_truncated_string closing_tag(@closing_tags.delete_at (@closing_tags.length - 1)) if @closing_tags.length > 1 end def enter_ignored_level @ignored_levels += 1 end def exit_ignored_level @ignored_levels -= 1 end def ignore_mode? 
@ignored_levels > 0 end end truncato-0.7.12/lib/truncato/version.rb0000644000004100000410000000004714263137635020122 0ustar www-datawww-datamodule Truncato VERSION='0.7.12' end truncato-0.7.12/lib/truncato/truncato.rb0000644000004100000410000000412514263137635020275 0ustar www-datawww-datamodule Truncato DEFAULT_OPTIONS = { max_length: 30, count_tags: true, tail: "...", filtered_attributes: [] } ARTIFICIAL_ROOT_NAME = 'truncato-artificial-root' # Truncates the source XML string and returns the truncated XML. It will keep a valid XML structure # and insert a _tail_ text indicating the position where content were removed (...). # # @param [String] source the XML source to truncate # @param [Hash] user_options truncation options # @option user_options [Integer] :max_length Maximum length # @option user_options [String] :tail text to append when the truncation happens # @option user_options [Boolean] :count_tags `true` for counting tags for truncation, `false` for not counting them # @option user_options [Array] :filtered_attributes Array of names of attributes that should be excluded in the resulting truncated string. This allows you to make the truncated string shorter by excluding the content of attributes you can discard in some given context, e.g HTML `style` attribute. # @return [String] the truncated string def self.truncate source, user_options={} options = DEFAULT_OPTIONS.merge(user_options) self.truncate_html(source, options) || self.truncate_no_html(source, options) end private def self.truncate_html source, options self.do_truncate_html(source, options) ? self.do_truncate_html(with_artificial_root(source), options) : nil end def self.do_truncate_html source, options truncated_sax_document = TruncatedSaxDocument.new(options) parser = Nokogiri::HTML::SAX::Parser.new(truncated_sax_document) parser.parse(source) { |context| context.replace_entities = false } truncated_string = truncated_sax_document.truncated_string truncated_string.empty? ? 
nil : truncated_string end def self.with_artificial_root(source) "<#{ARTIFICIAL_ROOT_NAME}>#{source}" end def self.truncate_no_html source, options max_length = options[:max_length] tail = source.length > max_length ? options[:tail] : '' "#{source[0..max_length-1]}#{tail}" end end truncato-0.7.12/lib/truncato.rb0000644000004100000410000000012614263137635016433 0ustar www-datawww-dataDir[File.dirname(__FILE__) + '/truncato/**/*.rb'].each do |file| require file end truncato-0.7.12/truncato.gemspec0000644000004100000410000000355114263137635016712 0ustar www-datawww-data######################################################### # This file has been automatically generated by gem2tgz # ######################################################### # -*- encoding: utf-8 -*- # stub: truncato 0.7.12 ruby lib Gem::Specification.new do |s| s.name = "truncato".freeze s.version = "0.7.12" s.required_rubygems_version = Gem::Requirement.new(">= 0".freeze) if s.respond_to? :required_rubygems_version= s.metadata = { "allowed_push_host" => "https://rubygems.org" } if s.respond_to? :metadata= s.require_paths = ["lib".freeze] s.authors = ["Jorge Manrubia".freeze] s.date = "2013-09-10" s.description = "Ruby tool for truncating HTML strings keeping a valid HTML markup".freeze s.email = "jorge.manrubia@gmail.com".freeze s.extra_rdoc_files = ["LICENSE.txt".freeze, "README.md".freeze] s.files = ["LICENSE.txt".freeze, "README.md".freeze, "Rakefile".freeze, "lib/truncato.rb".freeze, "lib/truncato/truncated_sax_document.rb".freeze, "lib/truncato/truncato.rb".freeze, "lib/truncato/version.rb".freeze] s.homepage = "https://github.com/jorgemanrubia/truncato".freeze s.licenses = ["MIT".freeze] s.rubygems_version = "3.2.5".freeze s.summary = "A tool for truncating HTML strings efficiently".freeze if s.respond_to? :specification_version then s.specification_version = 4 end if s.respond_to? 
:add_runtime_dependency then s.add_runtime_dependency(%q.freeze, ["~> 4.3.1"]) s.add_runtime_dependency(%q.freeze, [">= 1.7.0", "<= 2.0"]) s.add_development_dependency(%q.freeze, [">= 0"]) s.add_development_dependency(%q.freeze, ["~> 2.14.1"]) else s.add_dependency(%q.freeze, ["~> 4.3.1"]) s.add_dependency(%q.freeze, [">= 1.7.0", "<= 2.0"]) s.add_dependency(%q.freeze, [">= 0"]) s.add_dependency(%q.freeze, ["~> 2.14.1"]) end end truncato-0.7.12/LICENSE.txt0000644000004100000410000000204214263137635015323 0ustar www-datawww-dataCopyright (c) 2011 Jorge Manrubia Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.