pax_global_header00006660000000000000000000000064122407660030014512gustar00rootroot0000000000000052 comment=b35a48ae9a40ce18f9c3e15c6eae1fd5f5e3b222 twitter-text-rb-1.7.0/000077500000000000000000000000001224076600300146045ustar00rootroot00000000000000twitter-text-rb-1.7.0/.gemtest000066400000000000000000000000001224076600300162430ustar00rootroot00000000000000twitter-text-rb-1.7.0/.gitignore000066400000000000000000000005631224076600300166000ustar00rootroot00000000000000*.gem *.rbc *.sw[a-p] *.tmproj *.tmproject *.un~ *~ .DS_Store .Spotlight-V100 .Trashes ._* .bundle .config .directory .elc .emacs.desktop .emacs.desktop.lock .redcar .yardoc Desktop.ini Gemfile.lock Icon? InstalledFiles Session.vim Thumbs.db \#*\# _yardoc auto-save-list coverage doc lib/bundler/man pkg pkg/* rdoc spec/reports test/tmp test/version_tmp tmp tmtags tramp twitter-text-rb-1.7.0/.gitmodules000066400000000000000000000002171224076600300167610ustar00rootroot00000000000000[submodule "test/twitter-text-conformance"] path = test/twitter-text-conformance url = git://github.com/twitter/twitter-text-conformance.git twitter-text-rb-1.7.0/.rspec000066400000000000000000000000301224076600300157120ustar00rootroot00000000000000--color --format=nested twitter-text-rb-1.7.0/.travis.yml000066400000000000000000000000621224076600300167130ustar00rootroot00000000000000language: ruby rvm: - 1.8.7 - 1.9.3 - 2.0.0 twitter-text-rb-1.7.0/Gemfile000066400000000000000000000001371224076600300161000ustar00rootroot00000000000000source "http://rubygems.org" # Specify the gem's dependencies in twitter-text.gemspec gemspec twitter-text-rb-1.7.0/LICENSE000066400000000000000000000236101224076600300156130ustar00rootroot00000000000000Copyright 2011 Twitter, Inc. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this work except in compliance with the License. 
You may obtain a copy of the License below, or at: http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). 
"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. 
twitter-text-rb-1.7.0/README.rdoc000066400000000000000000000071201224076600300164120ustar00rootroot00000000000000== twitter-text {}[http://travis-ci.org/twitter/twitter-text-rb] {}[https://codeclimate.com/github/twitter/twitter-text-rb] A gem that provides text processing routines for Twitter Tweets. The major reason for this is to unify the various auto-linking and extraction of usernames, lists, hashtags and URLs. == Extraction Examples # Extraction class MyClass include Twitter::Extractor usernames = extract_mentioned_screen_names("Mentioning @twitter and @jack") # usernames = ["twitter", "jack"] end # Extraction with a block argument class MyClass include Twitter::Extractor extract_reply_screen_name("@twitter are you hiring?").do |username| # username = "twitter" end end == Auto-linking Examples # Auto-link class MyClass include Twitter::Autolink html = auto_link("link @user, please #request") end # For Ruby on Rails you want to add this to app/helpers/application_helper.rb module ApplicationHelper include Twitter::Autolink end # Now the auto_link function is available in every view. So in index.html.erb: <%= auto_link("link @user, please #request") %> === Usernames Username extraction and linking matches all valid Twitter usernames but does not verify that the username is a valid Twitter account. === Lists Auto-link and extract list names when they are written in @user/list-name format. === Hashtags Auto-link and extract hashtags, where a hashtag can contain most letters or numbers but cannot be solely numbers and cannot contain punctuation. === URLs Asian languages like Chinese, Japanese or Korean may not use a delimiter such as a space to separate normal text from URLs making it difficult to identify where the URL ends and the text starts. For this reason twitter-text currently does not support extracting or auto-linking of URLs immediately followed by non-Latin characters. Example: "http://twitter.com/は素晴らしい" . 
The normal text is "は素晴らしい" and is not part of the URL even though it isn't space separated. === International Special care has been taken to be sure that auto-linking and extraction work in Tweets of all languages. This means that languages without spaces between words should work equally well. === Hit Highlighting Use to provide emphasis around the "hits" returned from the Search API, built to work against text that has been auto-linked already. === Conformance To run the Conformance suite, you'll need to add that project as a git submodule. From the root twitter-text-rb directory, run: git submodule add git@github.com:twitter/twitter-text-conformance.git test/twitter-text-conformance/ git submodule init git submodule update === Thanks Thanks to everybody who has filed issues, provided feedback or contributed patches. Patches courtesy of: * At Twitter … * Matt Sanford - http://github.com/mzsanford * Raffi Krikorian - http://github.com/r * Ben Cherry - http://github.com/bcherry * Patrick Ewing - http://github.com/hoverbird * Jeff Smick - http://github.com/sprsquish * Kenneth Kufluk - https://github.com/kennethkufluk * Keita Fujii - https://github.com/keitaf * Yoshimasa Niwa - https://github.com/niw * Patches from the community … * Jean-Philippe Bougie - http://github.com/jpbougie * Erik Michaels-Ober - https://github.com/sferik * Anyone who has filed an issue. It helps. Really. === Copyright and License Copyright 2011 Twitter, Inc. 
Licensed under the Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0 twitter-text-rb-1.7.0/Rakefile000066400000000000000000000040751224076600300162570ustar00rootroot00000000000000require 'bundler' include Rake::DSL Bundler::GemHelper.install_tasks task :default => ['spec', 'test:conformance'] task :test => :spec require 'rspec/core/rake_task' RSpec::Core::RakeTask.new(:spec) def conformance_version(dir) require 'digest' Dir[File.join(dir, '*')].inject(Digest::SHA1.new){|digest, file| digest.update(Digest::SHA1.file(file).hexdigest) } end namespace :test do namespace :conformance do desc "Update conformance testing data" task :update do puts "Updating conformance data ... " system("git submodule init") || raise("Failed to init submodule") system("git submodule update") || raise("Failed to update submodule") puts "Updating conformance data ... DONE" end desc "Change conformance test data to the lastest version" task :latest => ['conformance:update'] do current_dir = File.dirname(__FILE__) submodule_dir = File.join(File.dirname(__FILE__), "test", "twitter-text-conformance") version_before = conformance_version(submodule_dir) system("cd #{submodule_dir} && git pull origin master") || raise("Failed to pull submodule version") system("cd #{current_dir}") if conformance_version(submodule_dir) != version_before system("cd #{current_dir} && git add #{submodule_dir}") || raise("Failed to add upgrade files") system("git commit -m \"Upgraded to the latest conformance suite\" #{submodule_dir}") || raise("Failed to commit upgraded conformacne data") puts "Upgraded conformance suite." else puts "No conformance suite changes." 
end end desc "Run conformance test suite" task :run do ruby '-rubygems', "test/conformance_test.rb" end end desc "Run conformance test suite" task :conformance => ['conformance:latest', 'conformance:run'] do end end require 'rdoc/task' namespace :doc do RDoc::Task.new do |rd| rd.main = "README.rdoc" rd.rdoc_dir = 'doc' rd.rdoc_files.include("README.rdoc", "lib/**/*.rb") end end desc "Run cruise control build" task :cruise => [:spec, 'test:conformance'] do end twitter-text-rb-1.7.0/lib/000077500000000000000000000000001224076600300153525ustar00rootroot00000000000000twitter-text-rb-1.7.0/lib/twitter-text.rb000066400000000000000000000006651224076600300203720ustar00rootroot00000000000000major, minor, patch = RUBY_VERSION.split('.') $RUBY_1_9 = if major.to_i == 1 && minor.to_i < 9 # Ruby 1.8 KCODE check. Not needed on 1.9 and later. raise("twitter-text requires the $KCODE variable be set to 'UTF8' or 'u'") unless $KCODE[0].chr =~ /u/i false else true end %w( deprecation regex rewriter autolink extractor unicode validation hit_highlighter ).each do |name| require "twitter-text/#{name}" end twitter-text-rb-1.7.0/lib/twitter-text/000077500000000000000000000000001224076600300200365ustar00rootroot00000000000000twitter-text-rb-1.7.0/lib/twitter-text/autolink.rb000066400000000000000000000536241224076600300222230ustar00rootroot00000000000000# encoding: UTF-8 require 'set' require 'twitter-text/hash_helper' module Twitter # A module for including Tweet auto-linking in a class. The primary use of this is for helpers/views so they can auto-link # usernames, lists, hashtags and URLs. 
module Autolink extend self # Default CSS class for auto-linked lists DEFAULT_LIST_CLASS = "tweet-url list-slug".freeze # Default CSS class for auto-linked usernames DEFAULT_USERNAME_CLASS = "tweet-url username".freeze # Default CSS class for auto-linked hashtags DEFAULT_HASHTAG_CLASS = "tweet-url hashtag".freeze # Default CSS class for auto-linked cashtags DEFAULT_CASHTAG_CLASS = "tweet-url cashtag".freeze # Default URL base for auto-linked usernames DEFAULT_USERNAME_URL_BASE = "https://twitter.com/".freeze # Default URL base for auto-linked lists DEFAULT_LIST_URL_BASE = "https://twitter.com/".freeze # Default URL base for auto-linked hashtags DEFAULT_HASHTAG_URL_BASE = "https://twitter.com/#!/search?q=%23".freeze # Default URL base for auto-linked cashtags DEFAULT_CASHTAG_URL_BASE = "https://twitter.com/#!/search?q=%24".freeze # Default attributes for invisible span tag DEFAULT_INVISIBLE_TAG_ATTRS = "style='position:absolute;left:-9999px;'".freeze DEFAULT_OPTIONS = { :list_class => DEFAULT_LIST_CLASS, :username_class => DEFAULT_USERNAME_CLASS, :hashtag_class => DEFAULT_HASHTAG_CLASS, :cashtag_class => DEFAULT_CASHTAG_CLASS, :username_url_base => DEFAULT_USERNAME_URL_BASE, :list_url_base => DEFAULT_LIST_URL_BASE, :hashtag_url_base => DEFAULT_HASHTAG_URL_BASE, :cashtag_url_base => DEFAULT_CASHTAG_URL_BASE, :invisible_tag_attrs => DEFAULT_INVISIBLE_TAG_ATTRS }.freeze def auto_link_with_json(text, json, options = {}) # concatenate entities entities = json.values().flatten() # map JSON entity to twitter-text entity entities.each do |entity| HashHelper.symbolize_keys!(entity) # hashtag entity[:hashtag] = entity[:text] if entity[:text] end auto_link_entities(text, entities, options) end def auto_link_entities(text, entities, options = {}, &block) return text if entities.empty? 
# NOTE deprecate these attributes not options keys in options hash, then use html_attrs options = DEFAULT_OPTIONS.merge(options) options[:html_attrs] = extract_html_attrs_from_options!(options) options[:html_attrs][:rel] ||= "nofollow" unless options[:suppress_no_follow] Twitter::Rewriter.rewrite_entities(text, entities) do |entity, chars| if entity[:url] link_to_url(entity, chars, options, &block) elsif entity[:hashtag] link_to_hashtag(entity, chars, options, &block) elsif entity[:screen_name] link_to_screen_name(entity, chars, options, &block) elsif entity[:cashtag] link_to_cashtag(entity, chars, options, &block) end end end # Add tags around the usernames, lists, hashtags and URLs in the provided text. # The tags can be controlled with the following entries in the options hash: # Also any elements in the options hash will be converted to HTML attributes # and place in the tag. # # :url_class:: class to add to url tags # :list_class:: class to add to list tags # :username_class:: class to add to username tags # :hashtag_class:: class to add to hashtag tags # :cashtag_class:: class to add to cashtag tags # :username_url_base:: the value for href attribute on username links. The @username (minus the @) will be appended at the end of this. # :list_url_base:: the value for href attribute on list links. The @username/list (minus the @) will be appended at the end of this. # :hashtag_url_base:: the value for href attribute on hashtag links. The #hashtag (minus the #) will be appended at the end of this. # :cashtag_url_base:: the value for href attribute on cashtag links. The $cashtag (minus the $) will be appended at the end of this. 
# :invisible_tag_attrs:: HTML attribute to add to invisible span tags # :username_include_symbol:: place the @ symbol within username and list links # :suppress_lists:: disable auto-linking to lists # :suppress_no_follow:: do not add rel="nofollow" to auto-linked items # :symbol_tag:: tag to apply around symbol (@, #, $) in username / hashtag / cashtag links # :text_with_symbol_tag:: tag to apply around text part in username / hashtag / cashtag links # :url_target:: the value for target attribute on URL links. # :link_attribute_block:: function to modify the attributes of a link based on the entity. called with |entity, attributes| params, and should modify the attributes hash. # :link_text_block:: function to modify the text of a link based on the entity. called with |entity, text| params, and should return a modified text. def auto_link(text, options = {}, &block) auto_link_entities(text, Extractor.extract_entities_with_indices(text, :extract_url_without_protocol => false), options, &block) end # Add tags around the usernames and lists in the provided text. The # tags can be controlled with the following entries in the options hash. # Also any elements in the options hash will be converted to HTML attributes # and place in the tag. # # :list_class:: class to add to list tags # :username_class:: class to add to username tags # :username_url_base:: the value for href attribute on username links. The @username (minus the @) will be appended at the end of this. # :list_url_base:: the value for href attribute on list links. The @username/list (minus the @) will be appended at the end of this. 
# :username_include_symbol:: place the @ symbol within username and list links # :suppress_lists:: disable auto-linking to lists # :suppress_no_follow:: do not add rel="nofollow" to auto-linked items # :symbol_tag:: tag to apply around symbol (@, #, $) in username / hashtag / cashtag links # :text_with_symbol_tag:: tag to apply around text part in username / hashtag / cashtag links # :link_attribute_block:: function to modify the attributes of a link based on the entity. called with |entity, attributes| params, and should modify the attributes hash. # :link_text_block:: function to modify the text of a link based on the entity. called with |entity, text| params, and should return a modified text. def auto_link_usernames_or_lists(text, options = {}, &block) # :yields: list_or_username auto_link_entities(text, Extractor.extract_mentions_or_lists_with_indices(text), options, &block) end # Add tags around the hashtags in the provided text. # The tags can be controlled with the following entries in the options hash. # Also any elements in the options hash will be converted to HTML attributes # and place in the tag. # # :hashtag_class:: class to add to hashtag tags # :hashtag_url_base:: the value for href attribute. The hashtag text (minus the #) will be appended at the end of this. # :suppress_no_follow:: do not add rel="nofollow" to auto-linked items # :symbol_tag:: tag to apply around symbol (@, #, $) in username / hashtag / cashtag links # :text_with_symbol_tag:: tag to apply around text part in username / hashtag / cashtag links # :link_attribute_block:: function to modify the attributes of a link based on the entity. called with |entity, attributes| params, and should modify the attributes hash. # :link_text_block:: function to modify the text of a link based on the entity. called with |entity, text| params, and should return a modified text. 
def auto_link_hashtags(text, options = {}, &block) # :yields: hashtag_text auto_link_entities(text, Extractor.extract_hashtags_with_indices(text), options, &block) end # Add tags around the cashtags in the provided text. # The tags can be controlled with the following entries in the options hash. # Also any elements in the options hash will be converted to HTML attributes # and place in the tag. # # :cashtag_class:: class to add to cashtag tags # :cashtag_url_base:: the value for href attribute. The cashtag text (minus the $) will be appended at the end of this. # :suppress_no_follow:: do not add rel="nofollow" to auto-linked items # :symbol_tag:: tag to apply around symbol (@, #, $) in username / hashtag / cashtag links # :text_with_symbol_tag:: tag to apply around text part in username / hashtag / cashtag links # :link_attribute_block:: function to modify the attributes of a link based on the entity. called with |entity, attributes| params, and should modify the attributes hash. # :link_text_block:: function to modify the text of a link based on the entity. called with |entity, text| params, and should return a modified text. def auto_link_cashtags(text, options = {}, &block) # :yields: cashtag_text auto_link_entities(text, Extractor.extract_cashtags_with_indices(text), options, &block) end # Add tags around the URLs in the provided text. # The tags can be controlled with the following entries in the options hash. # Also any elements in the options hash will be converted to HTML attributes # and place in the tag. # # :url_class:: class to add to url tags # :invisible_tag_attrs:: HTML attribute to add to invisible span tags # :suppress_no_follow:: do not add rel="nofollow" to auto-linked items # :symbol_tag:: tag to apply around symbol (@, #, $) in username / hashtag / cashtag links # :text_with_symbol_tag:: tag to apply around text part in username / hashtag / cashtag links # :url_target:: the value for target attribute on URL links. 
# :link_attribute_block:: function to modify the attributes of a link based on the entity. called with |entity, attributes| params, and should modify the attributes hash. # :link_text_block:: function to modify the text of a link based on the entity. called with |entity, text| params, and should return a modified text. def auto_link_urls(text, options = {}, &block) auto_link_entities(text, Extractor.extract_urls_with_indices(text, :extract_url_without_protocol => false), options, &block) end # These methods are deprecated, will be removed in future. extend Deprecation # Deprecated: Please use auto_link_urls instead. # Add tags around the URLs in the provided text. # Any elements in the href_options hash will be converted to HTML attributes # and place in the tag. # Unless href_options contains :suppress_no_follow # the rel="nofollow" attribute will be added. alias :auto_link_urls_custom :auto_link_urls deprecate :auto_link_urls_custom, :auto_link_urls private HTML_ENTITIES = { '&' => '&', '>' => '>', '<' => '<', '"' => '"', "'" => ''' } def html_escape(text) text && text.to_s.gsub(/[&"'><]/) do |character| HTML_ENTITIES[character] end end # NOTE We will make this private in future. public :html_escape # Options which should not be passed as HTML attributes OPTIONS_NOT_ATTRIBUTES = Set.new([ :url_class, :list_class, :username_class, :hashtag_class, :cashtag_class, :username_url_base, :list_url_base, :hashtag_url_base, :cashtag_url_base, :username_url_block, :list_url_block, :hashtag_url_block, :cashtag_url_block, :link_url_block, :username_include_symbol, :suppress_lists, :suppress_no_follow, :url_entities, :invisible_tag_attrs, :symbol_tag, :text_with_symbol_tag, :url_target, :link_attribute_block, :link_text_block ]).freeze def extract_html_attrs_from_options!(options) html_attrs = {} options.reject! 
do |key, value| unless OPTIONS_NOT_ATTRIBUTES.include?(key) html_attrs[key] = value true end end html_attrs end def url_entities_hash(url_entities) (url_entities || {}).inject({}) do |entities, entity| HashHelper.symbolize_keys!(entity) entities[entity[:url]] = entity entities end end def link_to_url(entity, chars, options = {}) url = entity[:url] href = if options[:link_url_block] options[:link_url_block].call(url) else url end # NOTE auto link to urls do not use any default values and options # like url_class but use suppress_no_follow. html_attrs = options[:html_attrs].dup html_attrs[:class] = options[:url_class] if options.key?(:url_class) # add target attribute only if :url_target is specified html_attrs[:target] = options[:url_target] if options.key?(:url_target) url_entities = url_entities_hash(options[:url_entities]) # use entity from urlEntities if available url_entity = url_entities[url] || entity link_text = if url_entity[:display_url] html_attrs[:title] ||= url_entity[:expanded_url] link_url_with_entity(url_entity, options) else html_escape(url) end link_to_text(entity, link_text, href, html_attrs, options) end def link_url_with_entity(entity, options) display_url = entity[:display_url] expanded_url = entity[:expanded_url] invisible_tag_attrs = options[:invisible_tag_attrs] || DEFAULT_INVISIBLE_TAG_ATTRS # Goal: If a user copies and pastes a tweet containing t.co'ed link, the resulting paste # should contain the full original URL (expanded_url), not the display URL. # # Method: Whenever possible, we actually emit HTML that contains expanded_url, and use # font-size:0 to hide those parts that should not be displayed (because they are not part of display_url). # Elements with font-size:0 get copied even though they are not visible. # Note that display:none doesn't work here. Elements with display:none don't get copied. # # Additionally, we want to *display* ellipses, but we don't want them copied. 
To make this happen we # wrap the ellipses in a tco-ellipsis class and provide an onCopy handler that sets display:none on # everything with the tco-ellipsis class. # # Exception: pic.twitter.com images, for which expandedUrl = "https://twitter.com/#!/username/status/1234/photo/1 # For those URLs, display_url is not a substring of expanded_url, so we don't do anything special to render the elided parts. # For a pic.twitter.com URL, the only elided part will be the "https://", so this is fine. display_url_sans_ellipses = display_url.gsub("…", "") if expanded_url.include?(display_url_sans_ellipses) before_display_url, after_display_url = expanded_url.split(display_url_sans_ellipses, 2) preceding_ellipsis = /\A…/.match(display_url).to_s following_ellipsis = /…\z/.match(display_url).to_s # As an example: The user tweets "hi http://longdomainname.com/foo" # This gets shortened to "hi http://t.co/xyzabc", with display_url = "…nname.com/foo" # This will get rendered as: # # … # # http://longdomai # # # nname.com/foo # # #   # … # %(#{preceding_ellipsis} ) << %(#{html_escape(before_display_url)}) << %(#{html_escape(display_url_sans_ellipses)}) << %(#{html_escape(after_display_url)}) << %( #{following_ellipsis}) else html_escape(display_url) end end def link_to_hashtag(entity, chars, options = {}) hash = chars[entity[:indices].first] hashtag = entity[:hashtag] hashtag = yield(hashtag) if block_given? hashtag_class = options[:hashtag_class] if hashtag.match Twitter::Regex::REGEXEN[:rtl_chars] hashtag_class += ' rtl' end href = if options[:hashtag_url_block] options[:hashtag_url_block].call(hashtag) else "#{options[:hashtag_url_base]}#{hashtag}" end html_attrs = { :class => hashtag_class, # FIXME As our conformance test, hash in title should be half-width, # this should be bug of conformance data. 
:title => "##{hashtag}" }.merge(options[:html_attrs]) link_to_text_with_symbol(entity, hash, hashtag, href, html_attrs, options) end def link_to_cashtag(entity, chars, options = {}) dollar = chars[entity[:indices].first] cashtag = entity[:cashtag] cashtag = yield(cashtag) if block_given? href = if options[:cashtag_url_block] options[:cashtag_url_block].call(cashtag) else "#{options[:cashtag_url_base]}#{cashtag}" end html_attrs = { :class => "#{options[:cashtag_class]}", :title => "$#{cashtag}" }.merge(options[:html_attrs]) link_to_text_with_symbol(entity, dollar, cashtag, href, html_attrs, options) end def link_to_screen_name(entity, chars, options = {}) name = "#{entity[:screen_name]}#{entity[:list_slug]}" chunk = name chunk = yield(name) if block_given? name.downcase! at = chars[entity[:indices].first] html_attrs = options[:html_attrs].dup if entity[:list_slug] && !entity[:list_slug].empty? && !options[:suppress_lists] href = if options[:list_url_block] options[:list_url_block].call(name) else "#{options[:list_url_base]}#{name}" end html_attrs[:class] ||= "#{options[:list_class]}" else href = if options[:username_url_block] options[:username_url_block].call(chunk) else "#{options[:username_url_base]}#{name}" end html_attrs[:class] ||= "#{options[:username_class]}" end link_to_text_with_symbol(entity, at, chunk, href, html_attrs, options) end def link_to_text_with_symbol(entity, symbol, text, href, attributes = {}, options = {}) tagged_symbol = options[:symbol_tag] ? "<#{options[:symbol_tag]}>#{symbol}" : symbol text = html_escape(text) tagged_text = options[:text_with_symbol_tag] ? 
"<#{options[:text_with_symbol_tag]}>#{text}" : text if options[:username_include_symbol] || symbol !~ Twitter::Regex::REGEXEN[:at_signs] "#{link_to_text(entity, tagged_symbol + tagged_text, href, attributes, options)}" else "#{tagged_symbol}#{link_to_text(entity, tagged_text, href, attributes, options)}" end end def link_to_text(entity, text, href, attributes = {}, options = {}) attributes[:href] = href options[:link_attribute_block].call(entity, attributes) if options[:link_attribute_block] text = options[:link_text_block].call(entity, text) if options[:link_text_block] %(#{text}) end BOOLEAN_ATTRIBUTES = Set.new([:disabled, :readonly, :multiple, :checked]).freeze def tag_attrs(attributes) attributes.keys.sort_by{|k| k.to_s}.inject("") do |attrs, key| value = attributes[key] if BOOLEAN_ATTRIBUTES.include?(key) value = value ? key : nil end unless value.nil? value = case value when Array value.compact.join(" ") else value end attrs << %( #{html_escape(key)}="#{html_escape(value)}") end attrs end end end end twitter-text-rb-1.7.0/lib/twitter-text/deprecation.rb000066400000000000000000000006771224076600300226720ustar00rootroot00000000000000module Twitter module Deprecation def deprecate(method, new_method = nil) deprecated_method = :"deprecated_#{method}" message = "Deprecation: `#{method}` is deprecated." message << " Please use `#{new_method}` instead." if new_method alias_method(deprecated_method, method) define_method method do |*args, &block| warn message send(deprecated_method, *args, &block) end end end end twitter-text-rb-1.7.0/lib/twitter-text/extractor.rb000066400000000000000000000301701224076600300223770ustar00rootroot00000000000000# encoding: UTF-8 class String # Helper function to count the character length by first converting to an # array. This is needed because with unicode strings, the return value # of length may be incorrect def char_length if respond_to? :codepoints length else chars.kind_of?(Enumerable) ? 
chars.to_a.size : chars.size end end # Helper function to convert this string into an array of unicode characters. def to_char_a @to_char_a ||= if chars.kind_of?(Enumerable) chars.to_a else char_array = [] 0.upto(char_length - 1) { |i| char_array << [chars.slice(i)].pack('U') } char_array end end end # Helper functions to return character offsets instead of byte offsets. class MatchData def char_begin(n) if string.respond_to? :codepoints self.begin(n) else string[0, self.begin(n)].char_length end end def char_end(n) if string.respond_to? :codepoints self.end(n) else string[0, self.end(n)].char_length end end end module Twitter # A module for including Tweet parsing in a class. This module provides function for the extraction and processing # of usernames, lists, URLs and hashtags. module Extractor extend self # Remove overlapping entities. # This returns a new array with no overlapping entities. def remove_overlapping_entities(entities) # sort by start index entities = entities.sort_by{|entity| entity[:indices].first} # remove duplicates prev = nil entities.reject!{|entity| (prev && prev[:indices].last > entity[:indices].first) || (prev = entity) && false} entities end # Extracts all usernames, lists, hashtags and URLs in the Tweet text # along with the indices for where the entity ocurred # If the text is nil or contains no entity an empty array # will be returned. # # If a block is given then it will be called for each entity. def extract_entities_with_indices(text, options = {}, &block) # extract all entities entities = extract_urls_with_indices(text, options) + extract_hashtags_with_indices(text, :check_url_overlap => false) + extract_mentions_or_lists_with_indices(text) + extract_cashtags_with_indices(text) return [] if entities.empty? entities = remove_overlapping_entities(entities) entities.each(&block) if block_given? entities end # Extracts a list of all usernames mentioned in the Tweet text. 
# If the text is nil or contains no username mentions an empty array
# will be returned.
#
# If a block is given then it will be called for each username.
def extract_mentioned_screen_names(text, &block) # :yields: username
  mentions = extract_mentioned_screen_names_with_indices(text)
  usernames = mentions.map { |mention| mention[:screen_name] }
  usernames.each(&block) if block_given?
  usernames
end

# Collects every username mention in the Tweet text together with the
# indices where it occurred. Entries reported by
# extract_mentions_or_lists_with_indices that carry a non-empty list slug
# are left out. Returns an array of hashes of the form
# {:screen_name => name, :indices => [start, end]}; the array is empty
# when the text is nil or has no username mentions.
#
# If a block is given, it is called once per mention with the username,
# the start index, and the end index.
def extract_mentioned_screen_names_with_indices(text) # :yields: username, start, end
  return [] unless text

  mentions = []
  extract_mentions_or_lists_with_indices(text) do |screen_name, list_slug, start_index, end_index|
    if list_slug.empty?
      mentions << { :screen_name => screen_name, :indices => [start_index, end_index] }
    end
  end

  if block_given?
    mentions.each do |mention|
      indices = mention[:indices]
      yield mention[:screen_name], indices.first, indices.last
    end
  end

  mentions
end

# Extracts a list of all usernames or lists mentioned in the Tweet text
# along with the indices for where the mention occurred. If the
# text is nil or contains no username or list mentions, an empty array
# will be returned.
#
# If a block is given, then it will be called with each username, list slug, the start
# index, and the end index in the text. The list_slug will be an empty string
# if this is a username mention.
def extract_mentions_or_lists_with_indices(text) # :yields: username, list_slug, start, end
  # Cheap pre-check: nothing to extract when the text has no at sign.
  return [] unless text =~ /[@@]/

  entries = []
  text.to_s.scan(Twitter::Regex[:valid_mention_or_list]) do |_before, _at, screen_name, list_slug|
    # Keep the MatchData now; the filter below runs another match and would replace it.
    mention_match = Regexp.last_match
    next if mention_match.post_match =~ Twitter::Regex[:end_mention_match]

    # Group 3 is the screen name, group 4 the optional list slug.
    last_group = list_slug.nil? ? 3 : 4
    entries << {
      :screen_name => screen_name,
      :list_slug   => list_slug || "",
      # The start index is one before the screen name so it includes the at sign.
      :indices     => [mention_match.char_begin(3) - 1, mention_match.char_end(last_group)]
    }
  end

  if block_given?
    entries.each do |entry|
      indices = entry[:indices]
      yield entry[:screen_name], entry[:list_slug], indices.first, indices.last
    end
  end

  entries
end

# Extracts the username replied to in the Tweet text. Returns nil when the
# text is nil, when it does not match the reply pattern, or when the text
# right after the matched username matches the end-of-mention filter.
#
# If a block is given then it will be called with the username replied to (if any)
def extract_reply_screen_name(text) # :yields: username
  return nil unless text

  reply_match = text.match(Twitter::Regex[:valid_reply])
  return unless reply_match.respond_to?(:captures)
  # $' holds whatever follows the reply prefix that was just matched.
  return if $' =~ Twitter::Regex[:end_mention_match]

  username = reply_match.captures.first
  yield username if block_given?
  username
end

# Extracts a list of all URLs included in the Tweet text, as plain strings
# taken from the :url entry of each result of extract_urls_with_indices.
#
# If a block is given then it will be called for each URL.
def extract_urls(text, &block) # :yields: url
  urls = extract_urls_with_indices(text).map { |entity| entity[:url] }
  urls.each(&block) if block_given?
  urls
end

# Extracts a list of all URLs included in the Tweet text along
# with the indices. If the text is nil or contains no
# URLs an empty array will be returned.
#
# If a block is given then it will be called for each URL.
#
# Options:
# :extract_url_without_protocol:: when truthy, URLs written without a
#                                 leading http:// or https:// are
#                                 extracted as well.
#
# Returns an array of hashes of the form
# {:url => url, :indices => [start, end]}.
def extract_urls_with_indices(text, options = {:extract_url_without_protocol => true}) # :yields: url, start, end
  return [] unless text

  # Coerce once so that the cheap pre-check and the scan below both work on
  # the same String (the pre-check used to call #index on the raw argument).
  text = text.to_s
  without_protocol = options[:extract_url_without_protocol]
  # Pre-check: a protocol-less URL needs a dot, a URL with protocol needs a colon.
  return [] unless without_protocol ? text.index(".") : text.index(":")

  urls = []
  text.scan(Twitter::Regex[:valid_url]) do |_all, before, url, protocol, domain, _port, path, _query|
    # Save the MatchData before any further matching replaces it.
    url_match = $~
    start_position = url_match.char_begin(3)
    end_position = url_match.char_end(3)

    if protocol
      # In the case of t.co URLs, don't allow additional path characters
      if url =~ Twitter::Regex[:valid_tco_url]
        url = $&
        end_position = start_position + url.char_length
      end
      urls << { :url => url, :indices => [start_position, end_position] }
    else
      # If protocol is missing and domain contains non-ASCII characters,
      # extract ASCII-only domains.
      next if !without_protocol || before =~ Twitter::Regex[:invalid_url_without_protocol_preceding_chars]

      last_url = nil
      last_url_invalid_match = nil
      domain.scan(Twitter::Regex[:valid_ascii_domain]) do |ascii_domain|
        # Offsets of this domain inside +domain+; read before the next match runs.
        domain_match = $~
        last_url = {
          :url => ascii_domain,
          :indices => [start_position + domain_match.char_begin(0),
                       start_position + domain_match.char_end(0)]
        }
        last_url_invalid_match = ascii_domain =~ Twitter::Regex[:invalid_short_domain]
        urls << last_url unless last_url_invalid_match
      end

      # no ASCII-only domain found. Skip the entire URL
      next unless last_url

      # last_url only contains domain. Need to add path and query if they exist.
      if path
        # last_url was not added. Add it to urls here.
        urls << last_url if last_url_invalid_match
        last_url[:url] = url.sub(domain, last_url[:url])
        last_url[:indices][1] = end_position
      end
    end
  end

  if block_given?
    urls.each { |entity| yield entity[:url], entity[:indices].first, entity[:indices].last }
  end
  urls
end

# Extracts a list of all hashtags included in the Tweet text. If the
# text is nil or contains no hashtags an empty array
# will be returned. The array returned will not include the leading #
# character.
#
# If a block is given then it will be called for each hashtag.
def extract_hashtags(text, &block) # :yields: hashtag_text
  entities = extract_hashtags_with_indices(text)
  hashtags = entities.map { |entity| entity[:hashtag] }
  hashtags.each(&block) if block_given?
  hashtags
end

# Extracts all hashtags in the Tweet text along with their indices.
# Returns an array of hashes of the form
# {:hashtag => text, :indices => [start, end]}, where the hashtag text
# does not include the leading # character. An empty array is returned
# when the text is nil or contains no hashtags.
#
# When the :check_url_overlap option is truthy (the default), URLs are
# extracted from the same text and any hashtag overlapping a URL is dropped.
#
# If a block is given then it will be called with each hashtag text,
# its start index, and its end index.
def extract_hashtags_with_indices(text, options = {:check_url_overlap => true}) # :yields: hashtag_text, start, end
  return [] unless text =~ /[##]/

  tags = []
  text.scan(Twitter::Regex[:valid_hashtag]) do |_before, _hash, hash_text|
    # Keep the MatchData now; the filter below runs another match and would replace it.
    tag_match = Regexp.last_match
    next if tag_match.post_match =~ Twitter::Regex[:end_hashtag_match]

    # Group 2 is the hash sign, group 3 the hashtag text.
    tags << { :hashtag => hash_text, :indices => [tag_match.char_begin(2), tag_match.char_end(3)] }
  end

  if options[:check_url_overlap]
    urls = extract_urls_with_indices(text)
    unless urls.empty?
      # Resolve overlaps over hashtags and URLs together, then keep only
      # the entities that are hashtags.
      merged = remove_overlapping_entities(tags + urls)
      tags = merged.select { |entity| entity[:hashtag] }
    end
  end

  if block_given?
    tags.each do |tag|
      indices = tag[:indices]
      yield tag[:hashtag], indices.first, indices.last
    end
  end
  tags
end

# Extracts a list of all cashtags included in the Tweet text. If the
# text is nil or contains no cashtags an empty array
# will be returned. The array returned will not include the leading $
# character.
#
# If a block is given then it will be called for each cashtag.
def extract_cashtags(text, &block) # :yields: cashtag_text
  entities = extract_cashtags_with_indices(text)
  cashtags = entities.map { |entity| entity[:cashtag] }
  cashtags.each(&block) if block_given?
  cashtags
end

# Extracts a list of all cashtags included in the Tweet text. If the
# text is nil or contains no cashtags an empty array
# will be returned. The array returned will not include the leading $
# character.
# # If a block is given then it will be called for each cashtag. def extract_cashtags_with_indices(text) # :yields: cashtag_text, start, end return [] unless text =~ /\$/ tags = [] text.scan(Twitter::Regex[:valid_cashtag]) do |before, dollar, cash_text| match_data = $~ start_position = match_data.char_begin(2) end_position = match_data.char_end(3) tags << { :cashtag => cash_text, :indices => [start_position, end_position] } end tags.each{|tag| yield tag[:cashtag], tag[:indices].first, tag[:indices].last} if block_given? tags end end end twitter-text-rb-1.7.0/lib/twitter-text/hash_helper.rb000066400000000000000000000011701224076600300226440ustar00rootroot00000000000000module Twitter module HashHelper # Return a new hash with all keys converted to symbols, as long as # they respond to +to_sym+. # # { 'name' => 'Rob', 'years' => '28' }.symbolize_keys # #=> { :name => "Rob", :years => "28" } def self.symbolize_keys(hash) hash.dup.symbolize_keys! end # Destructively convert all keys to symbols, as long as they respond # to +to_sym+. Same as +symbolize_keys+, but modifies +self+. def self.symbolize_keys!(hash) hash.keys.each do |key| hash[(key.to_sym rescue key) || key] = hash.delete(key) end hash end end end twitter-text-rb-1.7.0/lib/twitter-text/hit_highlighter.rb000066400000000000000000000053401224076600300235270ustar00rootroot00000000000000module Twitter # Module for doing "hit highlighting" on tweets that have been auto-linked already. # Useful with the results returned from the Search API. module HitHighlighter extend self # Default Tag used for hit highlighting DEFAULT_HIGHLIGHT_TAG = "em" # Add tags around the hits provided in the text. The # hits should be an array of (start, end) index pairs, relative to the original # text, before auto-linking (but the text may already be auto-linked if desired) # # The tags can be overridden using the :tag option. 
For example: # # irb> hit_highlight("test hit here", [[5, 8]], :tag => 'strong') # => "test hit here" def hit_highlight(text, hits = [], options = {}) if hits.empty? return text end tag_name = options[:tag] || DEFAULT_HIGHLIGHT_TAG tags = ["<" + tag_name + ">", ""] chunks = text.split(/[<>]/) result = [] chunk_index, chunk = 0, chunks[0] chunk_chars = chunk.to_s.to_char_a prev_chunks_len = 0 chunk_cursor = 0 start_in_chunk = false for hit, index in hits.flatten.each_with_index do tag = tags[index % 2] placed = false until chunk.nil? || hit < prev_chunks_len + chunk.length do result << chunk_chars[chunk_cursor..-1] if start_in_chunk && hit == prev_chunks_len + chunk_chars.length result << tag placed = true end # correctly handle highlights that end on the final character. if tag_text = chunks[chunk_index+1] result << "<#{tag_text}>" end prev_chunks_len += chunk_chars.length chunk_cursor = 0 chunk_index += 2 chunk = chunks[chunk_index] chunk_chars = chunk.to_s.to_char_a start_in_chunk = false end if !placed && !chunk.nil? hit_spot = hit - prev_chunks_len result << chunk_chars[chunk_cursor...hit_spot] << tag chunk_cursor = hit_spot if index % 2 == 0 start_in_chunk = true else start_in_chunk = false end placed = true end # ultimate fallback, hits that run off the end get a closing tag if !placed result << tag end end if chunk if chunk_cursor < chunk_chars.length result << chunk_chars[chunk_cursor..-1] end (chunk_index+1).upto(chunks.length-1).each do |index| result << (index.even? ? chunks[index] : "<#{chunks[index]}>") end end result.flatten.join end end end twitter-text-rb-1.7.0/lib/twitter-text/regex.rb000066400000000000000000000424721224076600300215060ustar00rootroot00000000000000# encoding: UTF-8 module Twitter # A collection of regular expressions for parsing Tweet text. The regular expression # list is frozen at load time to ensure immutability. These regular expressions are # used throughout the Twitter classes. 
Special care has been taken to make # sure these reular expressions work with Tweets in all languages. class Regex REGEXEN = {} # :nodoc: def self.regex_range(from, to = nil) # :nodoc: if $RUBY_1_9 if to "\\u{#{from.to_s(16).rjust(4, '0')}}-\\u{#{to.to_s(16).rjust(4, '0')}}" else "\\u{#{from.to_s(16).rjust(4, '0')}}" end else if to [from].pack('U') + '-' + [to].pack('U') else [from].pack('U') end end end # Space is more than %20, U+3000 for example is the full-width space used with Kanji. Provide a short-hand # to access both the list of characters and a pattern suitible for use with String#split # Taken from: ActiveSupport::Multibyte::Handlers::UTF8Handler::UNICODE_WHITESPACE UNICODE_SPACES = [ (0x0009..0x000D).to_a, # White_Space # Cc [5] .. 0x0020, # White_Space # Zs SPACE 0x0085, # White_Space # Cc 0x00A0, # White_Space # Zs NO-BREAK SPACE 0x1680, # White_Space # Zs OGHAM SPACE MARK 0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR (0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE 0x2028, # White_Space # Zl LINE SEPARATOR 0x2029, # White_Space # Zp PARAGRAPH SEPARATOR 0x202F, # White_Space # Zs NARROW NO-BREAK SPACE 0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE 0x3000, # White_Space # Zs IDEOGRAPHIC SPACE ].flatten.map{|c| [c].pack('U*')}.freeze REGEXEN[:spaces] = /[#{UNICODE_SPACES.join('')}]/o # Character not allowed in Tweets INVALID_CHARACTERS = [ 0xFFFE, 0xFEFF, # BOM 0xFFFF, # Special 0x202A, 0x202B, 0x202C, 0x202D, 0x202E # Directional change ].map{|cp| [cp].pack('U') }.freeze REGEXEN[:invalid_control_characters] = /[#{INVALID_CHARACTERS.join('')}]/o major, minor, patch = RUBY_VERSION.split('.') if major.to_i >= 2 || major.to_i == 1 && minor.to_i >= 9 || (defined?(RUBY_ENGINE) && ["jruby", "rbx"].include?(RUBY_ENGINE)) REGEXEN[:list_name] = /[a-zA-Z][a-zA-Z0-9_\-\u0080-\u00ff]{0,24}/ else # This line barfs at compile time in Ruby 1.9, JRuby, or Rubinius. 
REGEXEN[:list_name] = eval("/[a-zA-Z][a-zA-Z0-9_\\-\x80-\xff]{0,24}/") end # Latin accented characters # Excludes 0xd7 from the range (the multiplication sign, confusable with "x"). # Also excludes 0xf7, the division sign LATIN_ACCENTS = [ regex_range(0xc0, 0xd6), regex_range(0xd8, 0xf6), regex_range(0xf8, 0xff), regex_range(0x0100, 0x024f), regex_range(0x0253, 0x0254), regex_range(0x0256, 0x0257), regex_range(0x0259), regex_range(0x025b), regex_range(0x0263), regex_range(0x0268), regex_range(0x026f), regex_range(0x0272), regex_range(0x0289), regex_range(0x028b), regex_range(0x02bb), regex_range(0x0300, 0x036f), regex_range(0x1e00, 0x1eff) ].join('').freeze RTL_CHARACTERS = [ regex_range(0x0600,0x06FF), regex_range(0x0750,0x077F), regex_range(0x0590,0x05FF), regex_range(0xFE70,0xFEFF) ].join('').freeze NON_LATIN_HASHTAG_CHARS = [ # Cyrillic (Russian, Ukrainian, etc.) regex_range(0x0400, 0x04ff), # Cyrillic regex_range(0x0500, 0x0527), # Cyrillic Supplement regex_range(0x2de0, 0x2dff), # Cyrillic Extended A regex_range(0xa640, 0xa69f), # Cyrillic Extended B regex_range(0x0591, 0x05bf), # Hebrew regex_range(0x05c1, 0x05c2), regex_range(0x05c4, 0x05c5), regex_range(0x05c7), regex_range(0x05d0, 0x05ea), regex_range(0x05f0, 0x05f4), regex_range(0xfb12, 0xfb28), # Hebrew Presentation Forms regex_range(0xfb2a, 0xfb36), regex_range(0xfb38, 0xfb3c), regex_range(0xfb3e), regex_range(0xfb40, 0xfb41), regex_range(0xfb43, 0xfb44), regex_range(0xfb46, 0xfb4f), regex_range(0x0610, 0x061a), # Arabic regex_range(0x0620, 0x065f), regex_range(0x066e, 0x06d3), regex_range(0x06d5, 0x06dc), regex_range(0x06de, 0x06e8), regex_range(0x06ea, 0x06ef), regex_range(0x06fa, 0x06fc), regex_range(0x06ff), regex_range(0x0750, 0x077f), # Arabic Supplement regex_range(0x08a0), # Arabic Extended A regex_range(0x08a2, 0x08ac), regex_range(0x08e4, 0x08fe), regex_range(0xfb50, 0xfbb1), # Arabic Pres. 
Forms A regex_range(0xfbd3, 0xfd3d), regex_range(0xfd50, 0xfd8f), regex_range(0xfd92, 0xfdc7), regex_range(0xfdf0, 0xfdfb), regex_range(0xfe70, 0xfe74), # Arabic Pres. Forms B regex_range(0xfe76, 0xfefc), regex_range(0x200c, 0x200c), # Zero-Width Non-Joiner regex_range(0x0e01, 0x0e3a), # Thai regex_range(0x0e40, 0x0e4e), # Hangul (Korean) regex_range(0x1100, 0x11ff), # Hangul Jamo regex_range(0x3130, 0x3185), # Hangul Compatibility Jamo regex_range(0xA960, 0xA97F), # Hangul Jamo Extended-A regex_range(0xAC00, 0xD7AF), # Hangul Syllables regex_range(0xD7B0, 0xD7FF), # Hangul Jamo Extended-B regex_range(0xFFA1, 0xFFDC) # Half-width Hangul ].join('').freeze REGEXEN[:latin_accents] = /[#{LATIN_ACCENTS}]+/o CJ_HASHTAG_CHARACTERS = [ regex_range(0x30A1, 0x30FA), regex_range(0x30FC, 0x30FE), # Katakana (full-width) regex_range(0xFF66, 0xFF9F), # Katakana (half-width) regex_range(0xFF10, 0xFF19), regex_range(0xFF21, 0xFF3A), regex_range(0xFF41, 0xFF5A), # Latin (full-width) regex_range(0x3041, 0x3096), regex_range(0x3099, 0x309E), # Hiragana regex_range(0x3400, 0x4DBF), # Kanji (CJK Extension A) regex_range(0x4E00, 0x9FFF), # Kanji (Unified) regex_range(0x20000, 0x2A6DF), # Kanji (CJK Extension B) regex_range(0x2A700, 0x2B73F), # Kanji (CJK Extension C) regex_range(0x2B740, 0x2B81F), # Kanji (CJK Extension D) regex_range(0x2F800, 0x2FA1F), regex_range(0x3003), regex_range(0x3005), regex_range(0x303B) # Kanji (CJK supplement) ].join('').freeze PUNCTUATION_CHARS = '!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~' SPACE_CHARS = " \t\n\x0B\f\r" CTRL_CHARS = "\x00-\x1F\x7F" # A hashtag must contain latin characters, numbers and underscores, but not all numbers. 
HASHTAG_ALPHA = /[a-z_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/io HASHTAG_ALPHANUMERIC = /[a-z0-9_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/io HASHTAG_BOUNDARY = /\A|\z|[^&a-z0-9_#{LATIN_ACCENTS}#{NON_LATIN_HASHTAG_CHARS}#{CJ_HASHTAG_CHARACTERS}]/o HASHTAG = /(#{HASHTAG_BOUNDARY})(#|#)(#{HASHTAG_ALPHANUMERIC}*#{HASHTAG_ALPHA}#{HASHTAG_ALPHANUMERIC}*)/io REGEXEN[:valid_hashtag] = /#{HASHTAG}/io # Used in Extractor for final filtering REGEXEN[:end_hashtag_match] = /\A(?:[##]|:\/\/)/o REGEXEN[:valid_mention_preceding_chars] = /(?:[^a-zA-Z0-9_!#\$%&*@@]|^|[rR][tT]:?)/o REGEXEN[:at_signs] = /[@@]/ REGEXEN[:valid_mention_or_list] = / (#{REGEXEN[:valid_mention_preceding_chars]}) # $1: Preceeding character (#{REGEXEN[:at_signs]}) # $2: At mark ([a-zA-Z0-9_]{1,20}) # $3: Screen name (\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})? # $4: List (optional) /ox REGEXEN[:valid_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o # Used in Extractor for final filtering REGEXEN[:end_mention_match] = /\A(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/o # URL related hash regex collection REGEXEN[:valid_url_preceding_chars] = /(?:[^A-Z0-9@@$###{INVALID_CHARACTERS.join('')}]|^)/io REGEXEN[:invalid_url_without_protocol_preceding_chars] = /[-_.\/]$/ DOMAIN_VALID_CHARS = "[^#{PUNCTUATION_CHARS}#{SPACE_CHARS}#{CTRL_CHARS}#{INVALID_CHARACTERS.join('')}#{UNICODE_SPACES.join('')}]" REGEXEN[:valid_subdomain] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[_-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io REGEXEN[:valid_domain_name] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io REGEXEN[:valid_gTLD] = /(?:(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel|xxx)(?=[^0-9a-z@]|$))/i REGEXEN[:valid_ccTLD] = %r{ (?: (?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch| 
ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm| gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li| lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe| pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg| th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw) (?=[^0-9a-z@]|$) ) }ix REGEXEN[:valid_punycode] = /(?:xn--[0-9a-z]+)/i REGEXEN[:valid_domain] = /(?: #{REGEXEN[:valid_subdomain]}*#{REGEXEN[:valid_domain_name]} (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]}) )/iox # This is used in Extractor REGEXEN[:valid_ascii_domain] = / (?:(?:[A-Za-z0-9\-_]|#{REGEXEN[:latin_accents]})+\.)+ (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]}) /iox # This is used in Extractor for stricter t.co URL extraction REGEXEN[:valid_tco_url] = /^https?:\/\/t\.co\/[a-z0-9]+/i # This is used in Extractor to filter out unwanted URLs. REGEXEN[:invalid_short_domain] = /\A#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_ccTLD]}\Z/io REGEXEN[:valid_port_number] = /[0-9]+/ REGEXEN[:valid_general_url_path_chars] = /[a-z0-9!\*';:=\+\,\.\$\/%#\[\]\-_~&|@#{LATIN_ACCENTS}]/io # Allow URL paths to contain up to two nested levels of balanced parens # 1. Used in Wikipedia URLs like /Primer_(film) # 2. Used in IIS sessions like /S(dfd346)/ # 3. 
Used in Rdio URLs like /track/We_Up_(Album_Version_(Edited))/ REGEXEN[:valid_url_balanced_parens] = / \( (?: #{REGEXEN[:valid_general_url_path_chars]}+ | # allow one nested level of balanced parentheses (?: #{REGEXEN[:valid_general_url_path_chars]}* \( #{REGEXEN[:valid_general_url_path_chars]}+ \) #{REGEXEN[:valid_general_url_path_chars]}* ) ) \) /iox # Valid end-of-path chracters (so /foo. does not gobble the period). # 1. Allow =&# for empty URL parameters and other URL-join artifacts REGEXEN[:valid_url_path_ending_chars] = /[a-z0-9=_#\/\+\-#{LATIN_ACCENTS}]|(?:#{REGEXEN[:valid_url_balanced_parens]})/io REGEXEN[:valid_url_path] = /(?: (?: #{REGEXEN[:valid_general_url_path_chars]}* (?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]}*)* #{REGEXEN[:valid_url_path_ending_chars]} )|(?:#{REGEXEN[:valid_general_url_path_chars]}+\/) )/iox REGEXEN[:valid_url_query_chars] = /[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]/i REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#\/]/i REGEXEN[:valid_url] = %r{ ( # $1 total match (#{REGEXEN[:valid_url_preceding_chars]}) # $2 Preceeding chracter ( # $3 URL (https?:\/\/)? # $4 Protocol (optional) (#{REGEXEN[:valid_domain]}) # $5 Domain(s) (?::(#{REGEXEN[:valid_port_number]}))? # $6 Port number (optional) (/#{REGEXEN[:valid_url_path]}*)? # $7 URL Path and anchor (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? 
# $8 Query String ) ) }iox REGEXEN[:cashtag] = /[a-z]{1,6}(?:[._][a-z]{1,2})?/i REGEXEN[:valid_cashtag] = /(^|#{REGEXEN[:spaces]})(\$)(#{REGEXEN[:cashtag]})(?=$|\s|[#{PUNCTUATION_CHARS}])/i # These URL validation pattern strings are based on the ABNF from RFC 3986 REGEXEN[:validate_url_unreserved] = /[a-z0-9\-._~]/i REGEXEN[:validate_url_pct_encoded] = /(?:%[0-9a-f]{2})/i REGEXEN[:validate_url_sub_delims] = /[!$&'()*+,;=]/i REGEXEN[:validate_url_pchar] = /(?: #{REGEXEN[:validate_url_unreserved]}| #{REGEXEN[:validate_url_pct_encoded]}| #{REGEXEN[:validate_url_sub_delims]}| [:\|@] )/iox REGEXEN[:validate_url_scheme] = /(?:[a-z][a-z0-9+\-.]*)/i REGEXEN[:validate_url_userinfo] = /(?: #{REGEXEN[:validate_url_unreserved]}| #{REGEXEN[:validate_url_pct_encoded]}| #{REGEXEN[:validate_url_sub_delims]}| : )*/iox REGEXEN[:validate_url_dec_octet] = /(?:[0-9]|(?:[1-9][0-9])|(?:1[0-9]{2})|(?:2[0-4][0-9])|(?:25[0-5]))/i REGEXEN[:validate_url_ipv4] = /(?:#{REGEXEN[:validate_url_dec_octet]}(?:\.#{REGEXEN[:validate_url_dec_octet]}){3})/iox # Punting on real IPv6 validation for now REGEXEN[:validate_url_ipv6] = /(?:\[[a-f0-9:\.]+\])/i # Also punting on IPvFuture for now REGEXEN[:validate_url_ip] = /(?: #{REGEXEN[:validate_url_ipv4]}| #{REGEXEN[:validate_url_ipv6]} )/iox # This is more strict than the rfc specifies REGEXEN[:validate_url_subdomain_segment] = /(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?)/i REGEXEN[:validate_url_domain_segment] = /(?:[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?)/i REGEXEN[:validate_url_domain_tld] = /(?:[a-z](?:[a-z0-9\-]*[a-z0-9])?)/i REGEXEN[:validate_url_domain] = /(?:(?:#{REGEXEN[:validate_url_subdomain_segment]}\.)* (?:#{REGEXEN[:validate_url_domain_segment]}\.) 
#{REGEXEN[:validate_url_domain_tld]})/iox REGEXEN[:validate_url_host] = /(?: #{REGEXEN[:validate_url_ip]}| #{REGEXEN[:validate_url_domain]} )/iox # Unencoded internationalized domains - this doesn't check for invalid UTF-8 sequences REGEXEN[:validate_url_unicode_subdomain_segment] = /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9_\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix REGEXEN[:validate_url_unicode_domain_segment] = /(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix REGEXEN[:validate_url_unicode_domain_tld] = /(?:(?:[a-z]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix REGEXEN[:validate_url_unicode_domain] = /(?:(?:#{REGEXEN[:validate_url_unicode_subdomain_segment]}\.)* (?:#{REGEXEN[:validate_url_unicode_domain_segment]}\.) #{REGEXEN[:validate_url_unicode_domain_tld]})/iox REGEXEN[:validate_url_unicode_host] = /(?: #{REGEXEN[:validate_url_ip]}| #{REGEXEN[:validate_url_unicode_domain]} )/iox REGEXEN[:validate_url_port] = /[0-9]{1,5}/ REGEXEN[:validate_url_unicode_authority] = %r{ (?:(#{REGEXEN[:validate_url_userinfo]})@)? # $1 userinfo (#{REGEXEN[:validate_url_unicode_host]}) # $2 host (?::(#{REGEXEN[:validate_url_port]}))? # $3 port }iox REGEXEN[:validate_url_authority] = %r{ (?:(#{REGEXEN[:validate_url_userinfo]})@)? # $1 userinfo (#{REGEXEN[:validate_url_host]}) # $2 host (?::(#{REGEXEN[:validate_url_port]}))? # $3 port }iox REGEXEN[:validate_url_path] = %r{(/#{REGEXEN[:validate_url_pchar]}*)*}i REGEXEN[:validate_url_query] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i REGEXEN[:validate_url_fragment] = %r{(#{REGEXEN[:validate_url_pchar]}|/|\?)*}i # Modified version of RFC 3986 Appendix B REGEXEN[:validate_url_unencoded] = %r{ \A # Full URL (?: ([^:/?#]+):// # $1 Scheme )? ([^/?#]*) # $2 Authority ([^?#]*) # $3 Path (?: \?([^#]*) # $4 Query )? 
(?: \#(.*) # $5 Fragment )?\Z }ix REGEXEN[:rtl_chars] = /[#{RTL_CHARACTERS}]/io REGEXEN.each_pair{|k,v| v.freeze } # Return the regular expression for a given key. If the key # is not a known symbol a nil will be returned. def self.[](key) REGEXEN[key] end end end twitter-text-rb-1.7.0/lib/twitter-text/rewriter.rb000066400000000000000000000036251224076600300222340ustar00rootroot00000000000000module Twitter # A module provides base methods to rewrite usernames, lists, hashtags and URLs. module Rewriter extend self def rewrite_entities(text, entities) chars = text.to_s.to_char_a # sort by start index entities = entities.sort_by{|entity| entity[:indices].first} result = [] last_index = entities.inject(0) do |last_index, entity| result << chars[last_index...entity[:indices].first] result << yield(entity, chars) entity[:indices].last end result << chars[last_index..-1] result.flatten.join end # These methods are deprecated, will be removed in future. extend Deprecation def rewrite(text, options = {}) [:hashtags, :urls, :usernames_or_lists].inject(text) do |key| options[key] ? send(:"rewrite_#{key}", text, &options[key]) : text end end deprecate :rewrite, :rewrite_entities def rewrite_usernames_or_lists(text) entities = Extractor.extract_mentions_or_lists_with_indices(text) rewrite_entities(text, entities) do |entity, chars| at = chars[entity[:indices].first] list_slug = entity[:list_slug] list_slug = nil if list_slug.empty? 
yield(at, entity[:screen_name], list_slug) end end deprecate :rewrite_usernames_or_lists, :rewrite_entities def rewrite_hashtags(text) entities = Extractor.extract_hashtags_with_indices(text) rewrite_entities(text, entities) do |entity, chars| hash = chars[entity[:indices].first] yield(hash, entity[:hashtag]) end end deprecate :rewrite_hashtags, :rewrite_entities def rewrite_urls(text) entities = Extractor.extract_urls_with_indices(text, :extract_url_without_protocol => false) rewrite_entities(text, entities) do |entity, chars| yield(entity[:url]) end end deprecate :rewrite_urls, :rewrite_entities end end twitter-text-rb-1.7.0/lib/twitter-text/unicode.rb000066400000000000000000000016241224076600300220140ustar00rootroot00000000000000module Twitter # This module lazily defines constants of the form Uxxxx for all Unicode # codepoints from U0000 to U10FFFF. The value of each constant is the # UTF-8 string for the codepoint. # Examples: # copyright = Unicode::U00A9 # euro = Unicode::U20AC # infinity = Unicode::U221E # module Unicode CODEPOINT_REGEX = /^U_?([0-9a-fA-F]{4,5}|10[0-9a-fA-F]{4})$/ def self.const_missing(name) # Check that the constant name is of the right form: U0000 to U10FFFF if name.to_s =~ CODEPOINT_REGEX # Convert the codepoint to an immutable UTF-8 string, # define a real constant for that value and return the value #p name, name.class const_set(name, [$1.to_i(16)].pack("U").freeze) else # Raise an error for constants that are not Unicode. raise NameError, "Uninitialized constant: Unicode::#{name}" end end end end twitter-text-rb-1.7.0/lib/twitter-text/validation.rb000066400000000000000000000107711224076600300225230ustar00rootroot00000000000000require 'unf' module Twitter module Validation extend self MAX_LENGTH = 140 DEFAULT_TCO_URL_LENGTHS = { :short_url_length => 22, :short_url_length_https => 23, :characters_reserved_per_media => 22 }.freeze # Returns the length of the string as it would be displayed. 
This is equivilent to the length of the Unicode NFC # (See: http://www.unicode.org/reports/tr15). This is needed in order to consistently calculate the length of a # string no matter which actual form was transmitted. For example: # # U+0065 Latin Small Letter E # + U+0301 Combining Acute Accent # ---------- # = 2 bytes, 2 characters, displayed as é (1 visual glyph) # … The NFC of {U+0065, U+0301} is {U+00E9}, which is a single chracter and a +display_length+ of 1 # # The string could also contain U+00E9 already, in which case the canonicalization will not change the value. # def tweet_length(text, options = {}) options = DEFAULT_TCO_URL_LENGTHS.merge(options) length = text.to_nfc.unpack("U*").length Twitter::Extractor.extract_urls_with_indices(text) do |url, start_position, end_position| length += start_position - end_position length += url.downcase =~ /^https:\/\// ? options[:short_url_length_https] : options[:short_url_length] end length end # Check the text for any reason that it may not be valid as a Tweet. This is meant as a pre-validation # before posting to api.twitter.com. There are several server-side reasons for Tweets to fail but this pre-validation # will allow quicker feedback. # # Returns false if this text is valid. Otherwise one of the following Symbols will be returned: # # :too_long:: if the text is too long # :empty:: if the text is nil or empty # :invalid_characters:: if the text contains non-Unicode or any of the disallowed Unicode characters def tweet_invalid?(text) return :empty if !text || text.empty? begin return :too_long if tweet_length(text) > MAX_LENGTH return :invalid_characters if Twitter::Regex::INVALID_CHARACTERS.any?{|invalid_char| text.include?(invalid_char) } rescue ArgumentError => e # non-Unicode value. return :invalid_characters end return false end def valid_tweet_text?(text) !tweet_invalid?(text) end def valid_username?(username) return false if !username || username.empty? 
extracted = Twitter::Extractor.extract_mentioned_screen_names(username) # Should extract the username minus the @ sign, hence the [1..-1] extracted.size == 1 && extracted.first == username[1..-1] end VALID_LIST_RE = /\A#{Twitter::Regex[:valid_mention_or_list]}\z/o def valid_list?(username_list) match = username_list.match(VALID_LIST_RE) # Must have matched and had nothing before or after !!(match && match[1] == "" && match[4] && !match[4].empty?) end def valid_hashtag?(hashtag) return false if !hashtag || hashtag.empty? extracted = Twitter::Extractor.extract_hashtags(hashtag) # Should extract the hashtag minus the # sign, hence the [1..-1] extracted.size == 1 && extracted.first == hashtag[1..-1] end def valid_url?(url, unicode_domains=true, require_protocol=true) return false if !url || url.empty? url_parts = url.match(Twitter::Regex[:validate_url_unencoded]) return false unless (url_parts && url_parts.to_s == url) scheme, authority, path, query, fragment = url_parts.captures return false unless ((!require_protocol || (valid_match?(scheme, Twitter::Regex[:validate_url_scheme]) && scheme.match(/\Ahttps?\Z/i))) && valid_match?(path, Twitter::Regex[:validate_url_path]) && valid_match?(query, Twitter::Regex[:validate_url_query], true) && valid_match?(fragment, Twitter::Regex[:validate_url_fragment], true)) return (unicode_domains && valid_match?(authority, Twitter::Regex[:validate_url_unicode_authority])) || (!unicode_domains && valid_match?(authority, Twitter::Regex[:validate_url_authority])) end private def valid_match?(string, regex, optional=false) return (string && string.match(regex) && $~.to_s == string) unless optional !(string && (!string.match(regex) || $~.to_s != string)) end end end twitter-text-rb-1.7.0/script/000077500000000000000000000000001224076600300161105ustar00rootroot00000000000000twitter-text-rb-1.7.0/script/destroy000077500000000000000000000005601224076600300175300ustar00rootroot00000000000000#!/usr/bin/env ruby APP_ROOT = 
File.expand_path(File.join(File.dirname(__FILE__), '..')) begin require 'rubigen' rescue LoadError require 'rubygems' require 'rubigen' end require 'rubigen/scripts/destroy' ARGV.shift if ['--help', '-h'].include?(ARGV[0]) RubiGen::Base.use_component_sources! [:newgem_simple, :test_unit] RubiGen::Scripts::Destroy.new.run(ARGV) twitter-text-rb-1.7.0/script/generate000077500000000000000000000005621224076600300176330ustar00rootroot00000000000000#!/usr/bin/env ruby APP_ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..')) begin require 'rubigen' rescue LoadError require 'rubygems' require 'rubigen' end require 'rubigen/scripts/generate' ARGV.shift if ['--help', '-h'].include?(ARGV[0]) RubiGen::Base.use_component_sources! [:newgem_simple, :test_unit] RubiGen::Scripts::Generate.new.run(ARGV) twitter-text-rb-1.7.0/spec/000077500000000000000000000000001224076600300155365ustar00rootroot00000000000000twitter-text-rb-1.7.0/spec/autolinking_spec.rb000066400000000000000000000727151224076600300214350ustar00rootroot00000000000000# encoding: utf-8 require File.dirname(__FILE__) + '/spec_helper' class TestAutolink include Twitter::Autolink end describe Twitter::Autolink do def original_text; end def url; end describe "auto_link_custom" do before do @autolinked_text = TestAutolink.new.auto_link(original_text) if original_text end describe "username autolinking" do context "username preceded by a space" do def original_text; "hello @jacob"; end it "should be linked" do @autolinked_text.should link_to_screen_name('jacob') end end context "username at beginning of line" do def original_text; "@jacob you're cool"; end it "should be linked" do @autolinked_text.should link_to_screen_name('jacob') end end context "username preceded by word character" do def original_text; "meet@the beach"; end it "should not be linked" do Nokogiri::HTML(@autolinked_text).search('a').should be_empty end end context "username preceded by non-word character" do def original_text; "great.@jacob"; 
end it "should be linked" do @autolinked_text.should link_to_screen_name('jacob') end end context "username containing non-word characters" do def original_text; "@zach&^$%^"; end it "should not be linked" do @autolinked_text.should link_to_screen_name('zach') end end context "username over twenty characters" do def original_text @twenty_character_username = "zach" * 5 "@" + @twenty_character_username + "1" end it "should not be linked" do @autolinked_text.should link_to_screen_name(@twenty_character_username) end end context "username followed by japanese" do def original_text; "@jacobの"; end it "should be linked" do @autolinked_text.should link_to_screen_name('jacob') end end context "username preceded by japanese" do def original_text; "あ@matz"; end it "should be linked" do @autolinked_text.should link_to_screen_name('matz') end end context "username surrounded by japanese" do def original_text; "あ@yoshimiの"; end it "should be linked" do @autolinked_text.should link_to_screen_name('yoshimi') end end context "username using full-width at-sign" do def original_text "#{[0xFF20].pack('U')}jacob" end it "should be linked" do @autolinked_text.should link_to_screen_name('jacob') end end end describe "list path autolinking" do context "when List is not available" do it "should not be linked" do @autolinked_text = TestAutolink.new.auto_link_usernames_or_lists("hello @jacob/my-list", :suppress_lists => true) @autolinked_text.should_not link_to_list_path('jacob/my-list') @autolinked_text.should include('my-list') end end context "slug preceded by a space" do def original_text; "hello @jacob/my-list"; end it "should be linked" do @autolinked_text.should link_to_list_path('jacob/my-list') end end context "username followed by a slash but no list" do def original_text; "hello @jacob/ my-list"; end it "should NOT be linked" do @autolinked_text.should_not link_to_list_path('jacob/my-list') @autolinked_text.should link_to_screen_name('jacob') end end context "empty username 
followed by a list" do def original_text; "hello @/my-list"; end it "should NOT be linked" do Nokogiri::HTML(@autolinked_text).search('a').should be_empty end end context "list slug at beginning of line" do def original_text; "@jacob/my-list"; end it "should be linked" do @autolinked_text.should link_to_list_path('jacob/my-list') end end context "username preceded by alpha-numeric character" do def original_text; "meet@the/beach"; end it "should not be linked" do Nokogiri::HTML(@autolinked_text).search('a').should be_empty end end context "username preceded by non-word character" do def original_text; "great.@jacob/my-list"; end it "should be linked" do @autolinked_text = TestAutolink.new.auto_link("great.@jacob/my-list") @autolinked_text.should link_to_list_path('jacob/my-list') end end context "username containing non-word characters" do def original_text; "@zach/test&^$%^"; end it "should be linked" do @autolinked_text.should link_to_list_path('zach/test') end end context "username over twenty characters" do def original_text @twentyfive_character_list = "jack/" + ("a" * 25) "@#{@twentyfive_character_list}12345" end it "should be linked" do @autolinked_text.should link_to_list_path(@twentyfive_character_list) end end end describe "hashtag autolinking" do context "with an all numeric hashtag" do def original_text; "#123"; end it "should not be linked" do @autolinked_text.should_not have_autolinked_hashtag('#123') end end context "with a hashtag with alphanumeric characters" do def original_text; "#ab1d"; end it "should be linked" do @autolinked_text.should have_autolinked_hashtag('#ab1d') end end context "with a hashtag with underscores" do def original_text; "#a_b_c_d"; end it "should be linked" do @autolinked_text.should have_autolinked_hashtag(original_text) end end context "with a hashtag that is preceded by a word character" do def original_text; "ab#cd"; end it "should not be linked" do @autolinked_text.should_not have_autolinked_hashtag(original_text) end 
end context "with a page anchor in a url" do def original_text; "Here's my url: http://foobar.com/#home"; end it "should not link the hashtag" do @autolinked_text.should_not have_autolinked_hashtag('#home') end it "should link the url" do @autolinked_text.should have_autolinked_url('http://foobar.com/#home') end end context "with a hashtag that starts with a number but has word characters" do def original_text; "#2ab"; end it "should be linked" do @autolinked_text.should have_autolinked_hashtag(original_text) end end context "with multiple valid hashtags" do def original_text; "I'm frickin' awesome #ab #cd #ef"; end it "links each hashtag" do @autolinked_text.should have_autolinked_hashtag('#ab') @autolinked_text.should have_autolinked_hashtag('#cd') @autolinked_text.should have_autolinked_hashtag('#ef') end end context "with a hashtag preceded by a ." do def original_text; "ok, great.#abc"; end it "should be linked" do @autolinked_text.should have_autolinked_hashtag('#abc') end end context "with a hashtag preceded by a &" do def original_text; "&#nbsp;"; end it "should not be linked" do @autolinked_text.should_not have_autolinked_hashtag('#nbsp;') end end context "with a hashtag that ends in an !" do def original_text; "#great!"; end it "should be linked, but should not include the !" 
do @autolinked_text.should have_autolinked_hashtag('#great') end end context "with a hashtag followed by Japanese" do def original_text; "#twj_devの"; end it "should be linked" do @autolinked_text.should have_autolinked_hashtag('#twj_devの') end end context "with a hashtag preceded by a full-width space" do def original_text; "#{[0x3000].pack('U')}#twj_dev"; end it "should be linked" do @autolinked_text.should have_autolinked_hashtag('#twj_dev') end end context "with a hashtag followed by a full-width space" do def original_text; "#twj_dev#{[0x3000].pack('U')}"; end it "should be linked" do @autolinked_text.should have_autolinked_hashtag('#twj_dev') end end context "with a hashtag using full-width hash" do def original_text; "#{[0xFF03].pack('U')}twj_dev"; end it "should be linked" do link = Nokogiri::HTML(@autolinked_text).search('a') (link.inner_text.respond_to?(:force_encoding) ? link.inner_text.force_encoding("utf-8") : link.inner_text).should == "#{[0xFF03].pack('U')}twj_dev" link.first['href'].should == 'https://twitter.com/#!/search?q=%23twj_dev' end end context "with a hashtag containing an accented latin character" do def original_text # the hashtag is #éhashtag "##{[0x00e9].pack('U')}hashtag" end it "should be linked" do @autolinked_text.should == "#éhashtag" end end end describe "URL autolinking" do def url; "http://www.google.com"; end context "when embedded in plain text" do def original_text; "On my search engine #{url} I found good links."; end it "should be linked" do @autolinked_text.should have_autolinked_url(url) end end context "when surrounded by Japanese;" do def original_text; "いまなにしてる#{url}いまなにしてる"; end it "should be linked" do @autolinked_text.should have_autolinked_url(url) end end context "with a path surrounded by parentheses;" do def original_text; "I found a neatness (#{url})"; end it "should be linked" do @autolinked_text.should have_autolinked_url(url) end context "when the URL ends with a slash;" do def url; "http://www.google.com/"; 
end it "should be linked" do @autolinked_text.should have_autolinked_url(url) end end context "when the URL has a path;" do def url; "http://www.google.com/fsdfasdf"; end it "should be linked" do @autolinked_text.should have_autolinked_url(url) end end end context "when path contains parens" do def original_text; "I found a neatness (#{url})"; end it "should be linked" do @autolinked_text.should have_autolinked_url(url) end context "wikipedia" do def url; "http://en.wikipedia.org/wiki/Madonna_(artist)"; end it "should be linked" do @autolinked_text.should have_autolinked_url(url) end end context "IIS session" do def url; "http://msdn.com/S(deadbeef)/page.htm"; end it "should be linked" do @autolinked_text.should have_autolinked_url(url) end end context "unbalanced parens" do def url; "http://example.com/i_has_a_("; end it "should be linked" do @autolinked_text.should have_autolinked_url("http://example.com/i_has_a_") end end context "balanced parens with a double quote inside" do def url; "http://foo.com/foo_(\")_bar" end it "should be linked" do @autolinked_text.should have_autolinked_url("http://foo.com/foo_") end end context "balanced parens hiding XSS" do def url; 'http://x.xx.com/("style="color:red"onmouseover="alert(1)' end it "should be linked" do @autolinked_text.should have_autolinked_url("http://x.xx.com/") end end end context "when preceded by a :" do def original_text; "Check this out @hoverbird:#{url}"; end it "should be linked" do @autolinked_text.should have_autolinked_url(url) end end context "with a URL ending in allowed punctuation" do it "does not consume ending punctuation" do matcher = TestAutolink.new %w| ? ! , . : ; ] ) } = \ ' |.each do |char| matcher.auto_link("#{url}#{char}").should have_autolinked_url(url) end end end context "with a URL preceded in forbidden characters" do it "should be linked" do matcher = TestAutolink.new %w| \ ' / ! 
= |.each do |char| matcher.auto_link("#{char}#{url}").should have_autolinked_url(url) end end end context "when embedded in a link tag" do def original_text; "#{url}"; end it "should be linked" do @autolinked_text.should have_autolinked_url(url) end end context "with multiple URLs" do def original_text; "http://www.links.org link at start of page, link at end http://www.foo.org"; end it "should autolink each one" do @autolinked_text.should have_autolinked_url('http://www.links.org') @autolinked_text.should have_autolinked_url('http://www.foo.org') end end context "with multiple URLs in different formats" do def original_text; "http://foo.com https://bar.com http://mail.foobar.org"; end it "should autolink each one, in the proper order" do @autolinked_text.should have_autolinked_url('http://foo.com') @autolinked_text.should have_autolinked_url('https://bar.com') @autolinked_text.should have_autolinked_url('http://mail.foobar.org') end end context "with a URL having a long TLD" do def original_text; "Yahoo integriert Facebook http://golem.mobi/0912/71607.html"; end it "should autolink it" do @autolinked_text.should have_autolinked_url('http://golem.mobi/0912/71607.html') end end context "with a url lacking the protocol" do def original_text; "I like www.foobar.com dudes"; end it "does not link at all" do link = Nokogiri::HTML(@autolinked_text).search('a') link.should be_empty end end context "with a @ in a URL" do context "with XSS attack" do def original_text; 'http://x.xx.com/@"style="color:pink"onmouseover=alert(1)//'; end it "should not allow XSS follwing @" do @autolinked_text.should have_autolinked_url('http://x.xx.com/') end end context "with a username not followed by a /" do def original_text; 'http://example.com/@foobar'; end it "should link url" do @autolinked_text.should have_autolinked_url('http://example.com/@foobar') end end context "with a username followed by a /" do def original_text; 'http://example.com/@foobar/'; end it "should not link the 
username but link full url" do @autolinked_text.should have_autolinked_url('http://example.com/@foobar/') @autolinked_text.should_not link_to_screen_name('foobar') end end end context "regex engine quirks" do context "does not spiral out of control on repeated periods" do def original_text; "Test a ton of periods http://example.com/path.........................................."; end it "should autolink" do @autolinked_text.should have_autolinked_url('http://example.com/path') end end context "does not spiral out of control on repeated dashes" do def original_text; "Single char file ext http://www.bestbuy.com/site/Currie+Technologies+-+Ezip+400+Scooter/9885188.p?id=1218189013070&skuId=9885188"; end it "should autolink" do @autolinked_text.should have_autolinked_url('http://www.bestbuy.com/site/Currie+Technologies+-+Ezip+400+Scooter/9885188.p?id=1218189013070&skuId=9885188') end end end end describe "Autolink all" do before do @linker = TestAutolink.new end it "should allow url/hashtag overlap" do auto_linked = @linker.auto_link("https://twitter.com/#search") auto_linked.should have_autolinked_url('https://twitter.com/#search') end it "should not add invalid option in HTML tags" do auto_linked = @linker.auto_link("https://twitter.com/ is a URL, not a hashtag", :hashtag_class => 'hashtag_classname') auto_linked.should have_autolinked_url('https://twitter.com/') auto_linked.should_not include('hashtag_class') auto_linked.should_not include('hashtag_classname') end it "should autolink url/hashtag/mention in text with Unicode supplementary characters" do auto_linked = @linker.auto_link("#{[0x10400].pack('U')} #hashtag #{[0x10400].pack('U')} @mention #{[0x10400].pack('U')} http://twitter.com/") auto_linked.should have_autolinked_hashtag('#hashtag') auto_linked.should link_to_screen_name('mention') auto_linked.should have_autolinked_url('http://twitter.com/') end end end describe "autolinking options" do before do @linker = TestAutolink.new end it "should show display_url 
when :url_entities provided" do linked = @linker.auto_link("http://t.co/0JG5Mcq", :url_entities => [{ "url" => "http://t.co/0JG5Mcq", "display_url" => "blog.twitter.com/2011/05/twitte…", "expanded_url" => "http://blog.twitter.com/2011/05/twitter-for-mac-update.html", "indices" => [ 84, 103 ] }]) html = Nokogiri::HTML(linked) html.search('a').should_not be_empty html.search('a[@href="http://t.co/0JG5Mcq"]').should_not be_empty html.search('span[@class=js-display-url]').inner_text.should == "blog.twitter.com/2011/05/twitte" html.inner_text.should == " http://blog.twitter.com/2011/05/twitter-for-mac-update.html …" html.search('span[@style="position:absolute;left:-9999px;"]').size.should == 4 end it "should accept invisible_tag_attrs option" do linked = @linker.auto_link("http://t.co/0JG5Mcq", { :url_entities => [{ "url" => "http://t.co/0JG5Mcq", "display_url" => "blog.twitter.com/2011/05/twitte…", "expanded_url" => "http://blog.twitter.com/2011/05/twitter-for-mac-update.html", "indices" => [ 0, 19 ] }], :invisible_tag_attrs => "style='dummy;'" }) html = Nokogiri::HTML(linked) html.search('span[@style="dummy;"]').size.should == 4 end it "should show display_url if available in entity" do linked = @linker.auto_link_entities("http://t.co/0JG5Mcq", [{ :url => "http://t.co/0JG5Mcq", :display_url => "blog.twitter.com/2011/05/twitte…", :expanded_url => "http://blog.twitter.com/2011/05/twitter-for-mac-update.html", :indices => [0, 19] }] ) html = Nokogiri::HTML(linked) html.search('a').should_not be_empty html.search('a[@href="http://t.co/0JG5Mcq"]').should_not be_empty html.search('span[@class=js-display-url]').inner_text.should == "blog.twitter.com/2011/05/twitte" html.inner_text.should == " http://blog.twitter.com/2011/05/twitter-for-mac-update.html …" end it "should apply :class as a CSS class" do linked = @linker.auto_link("http://example.com/", :class => 'myclass') linked.should have_autolinked_url('http://example.com/') linked.should match(/myclass/) end it "should 
apply :url_class only on URL" do linked = @linker.auto_link("http://twitter.com") linked.should have_autolinked_url('http://twitter.com') linked.should_not match(/class/) linked = @linker.auto_link("http://twitter.com", :url_class => 'testClass') linked.should have_autolinked_url('http://twitter.com') linked.should match(/class=\"testClass\"/) linked = @linker.auto_link("#hash @tw", :url_class => 'testClass') linked.should match(/class=\"tweet-url hashtag\"/) linked.should match(/class=\"tweet-url username\"/) linked.should_not match(/class=\"testClass\"/) end it "should add rel=nofollow by default" do linked = @linker.auto_link("http://example.com/") linked.should have_autolinked_url('http://example.com/') linked.should match(/nofollow/) end it "should include the '@' symbol in a username when passed :username_include_symbol" do linked = @linker.auto_link("@user", :username_include_symbol => true) linked.should link_to_screen_name('user', '@user') end it "should include the '@' symbol in a list when passed :username_include_symbol" do linked = @linker.auto_link("@user/list", :username_include_symbol => true) linked.should link_to_list_path('user/list', '@user/list') end it "should not add rel=nofollow when passed :suppress_no_follow" do linked = @linker.auto_link("http://example.com/", :suppress_no_follow => true) linked.should have_autolinked_url('http://example.com/') linked.should_not match(/nofollow/) end it "should not add a target attribute by default" do linked = @linker.auto_link("http://example.com/") linked.should have_autolinked_url('http://example.com/') linked.should_not match(/target=/) end it "should respect the :target option" do linked = @linker.auto_link("http://example.com/", :target => 'mywindow') linked.should have_autolinked_url('http://example.com/') linked.should match(/target="mywindow"/) end it "should customize href by username_url_block option" do linked = @linker.auto_link("@test", :username_url_block => lambda{|a| "dummy"}) 
linked.should have_autolinked_url('dummy', 'test') end it "should customize href by list_url_block option" do linked = @linker.auto_link("@test/list", :list_url_block => lambda{|a| "dummy"}) linked.should have_autolinked_url('dummy', 'test/list') end it "should customize href by hashtag_url_block option" do linked = @linker.auto_link("#hashtag", :hashtag_url_block => lambda{|a| "dummy"}) linked.should have_autolinked_url('dummy', '#hashtag') end it "should customize href by cashtag_url_block option" do linked = @linker.auto_link("$CASH", :cashtag_url_block => lambda{|a| "dummy"}) linked.should have_autolinked_url('dummy', '$CASH') end it "should customize href by link_url_block option" do linked = @linker.auto_link("http://example.com/", :link_url_block => lambda{|a| "dummy"}) linked.should have_autolinked_url('dummy', 'http://example.com/') end it "should modify link attributes by link_attribute_block" do linked = @linker.auto_link("#hash @mention", :link_attribute_block => lambda{|entity, attributes| attributes[:"dummy-hash-attr"] = "test" if entity[:hashtag] } ) linked.should match(/]+hashtag[^>]+dummy-hash-attr=\"test\"[^>]+>/) linked.should_not match(/]+username[^>]+dummy-hash-attr=\"test\"[^>]+>/) linked.should_not match(/link_attribute_block/i) linked = @linker.auto_link("@mention http://twitter.com/", :link_attribute_block => lambda{|entity, attributes| attributes["dummy-url-attr"] = entity[:url] if entity[:url] } ) linked.should_not match(/]+username[^>]+dummy-url-attr=\"http:\/\/twitter.com\/\"[^>]*>/) linked.should match(/]+dummy-url-attr=\"http:\/\/twitter.com\/\"/) end it "should modify link text by link_text_block" do linked = @linker.auto_link("#hash @mention", :link_text_block => lambda{|entity, text| entity[:hashtag] ? 
"#replaced" : "pre_#{text}_post" } ) linked.should match(/]+>#replaced<\/a>/) linked.should match(/]+>pre_mention_post<\/a>/) linked = @linker.auto_link("#hash @mention", { :link_text_block => lambda{|entity, text| "pre_#{text}_post" }, :symbol_tag => "s", :text_with_symbol_tag => "b", :username_include_symbol => true }) linked.should match(/]+>pre_#<\/s>hash<\/b>_post<\/a>/) linked.should match(/]+>pre_@<\/s>mention<\/b>_post<\/a>/) end it "should apply :url_target only to auto-linked URLs" do auto_linked = @linker.auto_link("#hashtag @mention http://test.com/", {:url_target => '_blank'}) auto_linked.should have_autolinked_hashtag('#hashtag') auto_linked.should link_to_screen_name('mention') auto_linked.should have_autolinked_url('http://test.com/') auto_linked.should_not match(/]+hashtag[^>]+target[^>]+>/) auto_linked.should_not match(/]+username[^>]+target[^>]+>/) auto_linked.should match(/]+test.com[^>]+target=\"_blank\"[^>]*>/) end end describe "link_url_with_entity" do before do @linker = TestAutolink.new end it "should use display_url and expanded_url" do @linker.send(:link_url_with_entity, { :url => "http://t.co/abcde", :display_url => "twitter.com", :expanded_url => "http://twitter.com/"}, {:invisible_tag_attrs => "class='invisible'"}).gsub('"', "'").should == "twitter.com"; end it "should correctly handle display_url ending with '…'" do @linker.send(:link_url_with_entity, { :url => "http://t.co/abcde", :display_url => "twitter.com…", :expanded_url => "http://twitter.com/abcdefg"}, {:invisible_tag_attrs => "class='invisible'"}).gsub('"', "'").should == "twitter.com"; end it "should correctly handle display_url starting with '…'" do @linker.send(:link_url_with_entity, { :url => "http://t.co/abcde", :display_url => "…tter.com/abcdefg", :expanded_url => "http://twitter.com/abcdefg"}, {:invisible_tag_attrs => "class='invisible'"}).gsub('"', "'").should == "tter.com/abcdefg"; end it "should not create spans if display_url and expanded_url are on different 
domains" do @linker.send(:link_url_with_entity, { :url => "http://t.co/abcde", :display_url => "pic.twitter.com/xyz", :expanded_url => "http://twitter.com/foo/statuses/123/photo/1"}, {:invisible_tag_attrs => "class='invisible'"}).gsub('"', "'").should == "pic.twitter.com/xyz" end end describe "symbol_tag" do before do @linker = TestAutolink.new end it "should put :symbol_tag around symbol" do @linker.auto_link("@mention", {:symbol_tag => 's', :username_include_symbol=>true}).should match(/@<\/s>mention/) @linker.auto_link("#hash", {:symbol_tag => 's'}).should match(/#<\/s>hash/) result = @linker.auto_link("@mention #hash $CASH", {:symbol_tag => 'b', :username_include_symbol=>true}) result.should match(/@<\/b>mention/) result.should match(/#<\/b>hash/) result.should match(/\$<\/b>CASH/) end it "should put :text_with_symbol_tag around text" do result = @linker.auto_link("@mention #hash $CASH", {:text_with_symbol_tag => 'b'}) result.should match(/mention<\/b>/) result.should match(/hash<\/b>/) result.should match(/CASH<\/b>/) end it "should put :symbol_tag around symbol and :text_with_symbol_tag around text" do result = @linker.auto_link("@mention #hash $CASH", {:symbol_tag => 's', :text_with_symbol_tag => 'b', :username_include_symbol=>true}) result.should match(/@<\/s>mention<\/b>/) result.should match(/#<\/s>hash<\/b>/) result.should match(/\$<\/s>CASH<\/b>/) end end describe "html_escape" do before do @linker = TestAutolink.new end it "should escape html entities properly" do @linker.html_escape("&").should == "&" @linker.html_escape(">").should == ">" @linker.html_escape("<").should == "<" @linker.html_escape("\"").should == """ @linker.html_escape("'").should == "'" @linker.html_escape("&<>\"").should == "&<>"" @linker.html_escape("
").should == "<div>" @linker.html_escape("a&b").should == "a&b" @linker.html_escape("twitter & friends").should == "<a href="https://twitter.com" target="_blank">twitter & friends</a>" @linker.html_escape("&").should == "&amp;" @linker.html_escape(nil).should == nil end end end twitter-text-rb-1.7.0/spec/extractor_spec.rb000066400000000000000000000322271224076600300211160ustar00rootroot00000000000000# encoding: utf-8 require File.dirname(__FILE__) + '/spec_helper' class TestExtractor include Twitter::Extractor end describe Twitter::Extractor do before do @extractor = TestExtractor.new end describe "mentions" do context "single screen name alone " do it "should be linked" do @extractor.extract_mentioned_screen_names("@alice").should == ["alice"] end it "should be linked with _" do @extractor.extract_mentioned_screen_names("@alice_adams").should == ["alice_adams"] end it "should be linked if numeric" do @extractor.extract_mentioned_screen_names("@1234").should == ["1234"] end end context "multiple screen names" do it "should both be linked" do @extractor.extract_mentioned_screen_names("@alice @bob").should == ["alice", "bob"] end end context "screen names embedded in text" do it "should be linked in Latin text" do @extractor.extract_mentioned_screen_names("waiting for @alice to arrive").should == ["alice"] end it "should be linked in Japanese text" do @extractor.extract_mentioned_screen_names("の@aliceに到着を待っている").should == ["alice"] end it "should ignore mentions preceded by !, @, #, $, %, & or *" do invalid_chars = ['!', '@', '#', '$', '%', '&', '*'] invalid_chars.each do |c| @extractor.extract_mentioned_screen_names("f#{c}@kn").should == [] end end end it "should accept a block arugment and call it in order" do needed = ["alice", "bob"] @extractor.extract_mentioned_screen_names("@alice @bob") do |sn| sn.should == needed.shift end needed.should == [] end end describe "mentions with indices" do context "single screen name alone " do it "should be linked and the 
correct indices" do @extractor.extract_mentioned_screen_names_with_indices("@alice").should == [{:screen_name => "alice", :indices => [0, 6]}] end it "should be linked with _ and the correct indices" do @extractor.extract_mentioned_screen_names_with_indices("@alice_adams").should == [{:screen_name => "alice_adams", :indices => [0, 12]}] end it "should be linked if numeric and the correct indices" do @extractor.extract_mentioned_screen_names_with_indices("@1234").should == [{:screen_name => "1234", :indices => [0, 5]}] end end context "multiple screen names" do it "should both be linked with the correct indices" do @extractor.extract_mentioned_screen_names_with_indices("@alice @bob").should == [{:screen_name => "alice", :indices => [0, 6]}, {:screen_name => "bob", :indices => [7, 11]}] end it "should be linked with the correct indices even when repeated" do @extractor.extract_mentioned_screen_names_with_indices("@alice @alice @bob").should == [{:screen_name => "alice", :indices => [0, 6]}, {:screen_name => "alice", :indices => [7, 13]}, {:screen_name => "bob", :indices => [14, 18]}] end end context "screen names embedded in text" do it "should be linked in Latin text with the correct indices" do @extractor.extract_mentioned_screen_names_with_indices("waiting for @alice to arrive").should == [{:screen_name => "alice", :indices => [12, 18]}] end it "should be linked in Japanese text with the correct indices" do @extractor.extract_mentioned_screen_names_with_indices("の@aliceに到着を待っている").should == [{:screen_name => "alice", :indices => [1, 7]}] end end it "should accept a block arugment and call it in order" do needed = [{:screen_name => "alice", :indices => [0, 6]}, {:screen_name => "bob", :indices => [7, 11]}] @extractor.extract_mentioned_screen_names_with_indices("@alice @bob") do |sn, start_index, end_index| data = needed.shift sn.should == data[:screen_name] start_index.should == data[:indices].first end_index.should == data[:indices].last end needed.should == [] 
end it "should extract screen name in text with supplementary character" do @extractor.extract_mentioned_screen_names_with_indices("#{[0x10400].pack('U')} @alice").should == [{:screen_name => "alice", :indices => [2, 8]}] end end describe "replies" do context "should be extracted from" do it "should extract from lone name" do @extractor.extract_reply_screen_name("@alice").should == "alice" end it "should extract from the start" do @extractor.extract_reply_screen_name("@alice reply text").should == "alice" end it "should extract preceded by a space" do @extractor.extract_reply_screen_name(" @alice reply text").should == "alice" end it "should extract preceded by a full-width space" do @extractor.extract_reply_screen_name("#{[0x3000].pack('U')}@alice reply text").should == "alice" end end context "should not be extracted from" do it "should not be extracted when preceded by text" do @extractor.extract_reply_screen_name("reply @alice text").should == nil end it "should not be extracted when preceded by puctuation" do %w(. / _ - + # ! @).each do |punct| @extractor.extract_reply_screen_name("#{punct}@alice text").should == nil end end end context "should accept a block arugment" do it "should call the block on match" do @extractor.extract_reply_screen_name("@alice") do |sn| sn.should == "alice" end end it "should not call the block on no match" do calls = 0 @extractor.extract_reply_screen_name("not a reply") do |sn| calls += 1 end calls.should == 0 end end end describe "urls" do describe "matching URLS" do TestUrls::VALID.each do |url| it "should extract the URL #{url} and prefix it with a protocol if missing" do @extractor.extract_urls(url).first.should include(url) end it "should match the URL #{url} when it's embedded in other text" do text = "Sweet url: #{url} I found. 
#awesome" @extractor.extract_urls(text).first.should include(url) end end end describe "invalid URLS" do it "does not link urls with invalid domains" do @extractor.extract_urls("http://tld-too-short.x").should == [] end end describe "t.co URLS" do TestUrls::TCO.each do |url| it "should only extract the t.co URL from the URL #{url}" do extracted_urls = @extractor.extract_urls(url) extracted_urls.size.should == 1 extracted_url = extracted_urls.first extracted_url.should_not == url extracted_url.should == url[0...20] end it "should match the t.co URL from the URL #{url} when it's embedded in other text" do text = "Sweet url: #{url} I found. #awesome" extracted_urls = @extractor.extract_urls(text) extracted_urls.size.should == 1 extracted_url = extracted_urls.first extracted_url.should_not == url extracted_url.should == url[0...20] end end end end describe "urls with indices" do describe "matching URLS" do TestUrls::VALID.each do |url| it "should extract the URL #{url} and prefix it with a protocol if missing" do extracted_urls = @extractor.extract_urls_with_indices(url) extracted_urls.size.should == 1 extracted_url = extracted_urls.first extracted_url[:url].should include(url) extracted_url[:indices].first.should == 0 extracted_url[:indices].last.should == url.chars.to_a.size end it "should match the URL #{url} when it's embedded in other text" do text = "Sweet url: #{url} I found. 
#awesome" extracted_urls = @extractor.extract_urls_with_indices(text) extracted_urls.size.should == 1 extracted_url = extracted_urls.first extracted_url[:url].should include(url) extracted_url[:indices].first.should == 11 extracted_url[:indices].last.should == 11 + url.chars.to_a.size end end it "should extract URL in text with supplementary character" do @extractor.extract_urls_with_indices("#{[0x10400].pack('U')} http://twitter.com").should == [{:url => "http://twitter.com", :indices => [2, 20]}] end end describe "invalid URLS" do it "does not link urls with invalid domains" do @extractor.extract_urls_with_indices("http://tld-too-short.x").should == [] end end describe "t.co URLS" do TestUrls::TCO.each do |url| it "should only extract the t.co URL from the URL #{url} and adjust indices correctly" do extracted_urls = @extractor.extract_urls_with_indices(url) extracted_urls.size.should == 1 extracted_url = extracted_urls.first extracted_url[:url].should_not include(url) extracted_url[:url].should include(url[0...20]) extracted_url[:indices].first.should == 0 extracted_url[:indices].last.should == 20 end it "should match the t.co URL from the URL #{url} when it's embedded in other text" do text = "Sweet url: #{url} I found. 
#awesome" extracted_urls = @extractor.extract_urls_with_indices(text) extracted_urls.size.should == 1 extracted_url = extracted_urls.first extracted_url[:url].should_not include(url) extracted_url[:url].should include(url[0...20]) extracted_url[:indices].first.should == 11 extracted_url[:indices].last.should == 31 end end end end describe "hashtags" do context "extracts latin/numeric hashtags" do %w(text text123 123text).each do |hashtag| it "should extract ##{hashtag}" do @extractor.extract_hashtags("##{hashtag}").should == [hashtag] end it "should extract ##{hashtag} within text" do @extractor.extract_hashtags("pre-text ##{hashtag} post-text").should == [hashtag] end end end context "international hashtags" do context "should allow accents" do %w(mañana café münchen).each do |hashtag| it "should extract ##{hashtag}" do @extractor.extract_hashtags("##{hashtag}").should == [hashtag] end it "should extract ##{hashtag} within text" do @extractor.extract_hashtags("pre-text ##{hashtag} post-text").should == [hashtag] end end it "should not allow the multiplication character" do @extractor.extract_hashtags("#pre#{Twitter::Unicode::U00D7}post").should == ["pre"] end it "should not allow the division character" do @extractor.extract_hashtags("#pre#{Twitter::Unicode::U00F7}post").should == ["pre"] end end end it "should not extract numeric hashtags" do @extractor.extract_hashtags("#1234").should == [] end it "should extract hashtag followed by punctuations" do @extractor.extract_hashtags("#test1: #test2; #test3\"").should == ["test1", "test2" ,"test3"] end end describe "hashtags with indices" do def match_hashtag_in_text(hashtag, text, offset = 0) extracted_hashtags = @extractor.extract_hashtags_with_indices(text) extracted_hashtags.size.should == 1 extracted_hashtag = extracted_hashtags.first extracted_hashtag[:hashtag].should == hashtag extracted_hashtag[:indices].first.should == offset extracted_hashtag[:indices].last.should == offset + hashtag.chars.to_a.size + 1 end 
def not_match_hashtag_in_text(text) extracted_hashtags = @extractor.extract_hashtags_with_indices(text) extracted_hashtags.size.should == 0 end context "extracts latin/numeric hashtags" do %w(text text123 123text).each do |hashtag| it "should extract ##{hashtag}" do match_hashtag_in_text(hashtag, "##{hashtag}") end it "should extract ##{hashtag} within text" do match_hashtag_in_text(hashtag, "pre-text ##{hashtag} post-text", 9) end end end context "international hashtags" do context "should allow accents" do %w(mañana café münchen).each do |hashtag| it "should extract ##{hashtag}" do match_hashtag_in_text(hashtag, "##{hashtag}") end it "should extract ##{hashtag} within text" do match_hashtag_in_text(hashtag, "pre-text ##{hashtag} post-text", 9) end end it "should not allow the multiplication character" do match_hashtag_in_text("pre", "#pre#{[0xd7].pack('U')}post", 0) end it "should not allow the division character" do match_hashtag_in_text("pre", "#pre#{[0xf7].pack('U')}post", 0) end end end it "should not extract numeric hashtags" do not_match_hashtag_in_text("#1234") end it "should extract hashtag in text with supplementary character" do match_hashtag_in_text("hashtag", "#{[0x10400].pack('U')} #hashtag", 2) end end end twitter-text-rb-1.7.0/spec/hithighlighter_spec.rb000066400000000000000000000060551224076600300221060ustar00rootroot00000000000000# encoding: utf-8 require File.dirname(__FILE__) + '/spec_helper' class TestHitHighlighter include Twitter::HitHighlighter end describe Twitter::HitHighlighter do describe "highlight" do before do @highlighter = TestHitHighlighter.new end context "with options" do before do @original = "Testing this hit highliter" @hits = [[13,16]] end it "should default to tags" do @highlighter.hit_highlight(@original, @hits).should == "Testing this hit highliter" end it "should allow tag override" do @highlighter.hit_highlight(@original, @hits, :tag => 'b').should == "Testing this hit highliter" end end context "without links" do 
before do @original = "Hey! this is a test tweet" end it "should return original when no hits are provided" do @highlighter.hit_highlight(@original).should == @original end it "should highlight one hit" do @highlighter.hit_highlight(@original, hits = [[5, 9]]).should == "Hey! this is a test tweet" end it "should highlight two hits" do @highlighter.hit_highlight(@original, hits = [[5, 9], [15, 19]]).should == "Hey! this is a test tweet" end it "should correctly highlight first-word hits" do @highlighter.hit_highlight(@original, hits = [[0, 3]]).should == "Hey! this is a test tweet" end it "should correctly highlight last-word hits" do @highlighter.hit_highlight(@original, hits = [[20, 25]]).should == "Hey! this is a test tweet" end end context "with links" do it "should highlight with a single link" do @highlighter.hit_highlight("@bcherry this was a test tweet", [[9, 13]]).should == "@bcherry this was a test tweet" end it "should highlight with link at the end" do @highlighter.hit_highlight("test test test", [[5, 9]]).should == "test test test" end it "should highlight with a link at the beginning" do @highlighter.hit_highlight("test test test", [[5, 9]]).should == "test test test" end it "should highlight an entire link" do @highlighter.hit_highlight("test test test", [[5, 9]]).should == "test test test" end it "should highlight within a link" do @highlighter.hit_highlight("test test test", [[6, 8]]).should == "test test test" end it "should highlight around a link" do @highlighter.hit_highlight("test test test", [[3, 11]]).should == "test test test" end it "should fail gracefully with bad hits" do @highlighter.hit_highlight("test test", [[5, 20]]).should == "test test" end it "should not mess up with touching tags" do @highlighter.hit_highlight("foofoo", [[3,6]]).should == "foofoo" end end end end twitter-text-rb-1.7.0/spec/regex_spec.rb000066400000000000000000000021321224076600300202050ustar00rootroot00000000000000# encoding: utf-8 require File.dirname(__FILE__) 
+ '/spec_helper' describe "Twitter::Regex regular expressions" do describe "matching URLS" do TestUrls::VALID.each do |url| it "should match the URL #{url}" do url.should match_autolink_expression end it "should match the URL #{url} when it's embedded in other text" do text = "Sweet url: #{url} I found. #awesome" url.should match_autolink_expression_in(text) end end end describe "invalid URLS" do it "does not link urls with invalid characters" do TestUrls::INVALID.each {|url| url.should_not match_autolink_expression} end end describe "matching List names" do it "should match if less than 25 characters" do name = "Shuffleboard Community" name.length.should < 25 name.should match(Twitter::Regex::REGEXEN[:list_name]) end it "should not match if greater than 25 characters" do name = "Most Glorious Shady Meadows Shuffleboard Community" name.length.should > 25 name.should match(Twitter::Regex[:list_name]) end end end twitter-text-rb-1.7.0/spec/rewriter_spec.rb000066400000000000000000000400021224076600300207340ustar00rootroot00000000000000# encoding: utf-8 require File.dirname(__FILE__) + '/spec_helper' describe Twitter::Rewriter do def original_text; end def url; end def block(*args) if Array === @block_args unless Array === @block_args.first @block_args = [@block_args] end @block_args << args else @block_args = args end "[rewritten]" end describe "rewrite usernames" do #{{{ before do @rewritten_text = Twitter::Rewriter.rewrite_usernames_or_lists(original_text, &method(:block)) end context "username preceded by a space" do def original_text; "hello @jacob"; end it "should be rewritten" do @block_args.should == ["@", "jacob", nil] @rewritten_text.should == "hello [rewritten]" end end context "username at beginning of line" do def original_text; "@jacob you're cool"; end it "should be rewritten" do @block_args.should == ["@", "jacob", nil] @rewritten_text.should == "[rewritten] you're cool" end end context "username preceded by word character" do def original_text; 
"meet@the beach"; end it "should not be rewritten" do @block_args.should be_nil @rewritten_text.should == "meet@the beach" end end context "username preceded by non-word character" do def original_text; "great.@jacob"; end it "should be rewritten" do @block_args.should == ["@", "jacob", nil] @rewritten_text.should == "great.[rewritten]" end end context "username containing non-word characters" do def original_text; "@jacob&^$%^"; end it "should be rewritten" do @block_args.should == ["@", "jacob", nil] @rewritten_text.should == "[rewritten]&^$%^" end end context "username over twenty characters" do def original_text @twenty_character_username = "zach" * 5 "@" + @twenty_character_username + "1" end it "should be rewritten" do @block_args.should == ["@", @twenty_character_username, nil] @rewritten_text.should == "[rewritten]1" end end context "username followed by japanese" do def original_text; "@jacobの"; end it "should be rewritten" do @block_args.should == ["@", "jacob", nil] @rewritten_text.should == "[rewritten]の" end end context "username preceded by japanese" do def original_text; "あ@jacob"; end it "should be rewritten" do @block_args.should == ["@", "jacob", nil] @rewritten_text.should == "あ[rewritten]" end end context "username surrounded by japanese" do def original_text; "あ@jacobの"; end it "should be rewritten" do @block_args.should == ["@", "jacob", nil] @rewritten_text.should == "あ[rewritten]の" end end context "username using full-width at-sign" do def original_text "#{[0xFF20].pack('U')}jacob" end it "should be rewritten" do @block_args.should == ["@", "jacob", nil] @rewritten_text.should == "[rewritten]" end end end #}}} describe "rewrite lists" do #{{{ before do @rewritten_text = Twitter::Rewriter.rewrite_usernames_or_lists(original_text, &method(:block)) end context "slug preceded by a space" do def original_text; "hello @jacob/my-list"; end it "should be rewritten" do @block_args.should == ["@", "jacob", "/my-list"] @rewritten_text.should == "hello 
[rewritten]" end end context "username followed by a slash but no list" do def original_text; "hello @jacob/ my-list"; end it "should not be rewritten" do @block_args.should == ["@", "jacob", nil] @rewritten_text.should == "hello [rewritten]/ my-list" end end context "empty username followed by a list" do def original_text; "hello @/my-list"; end it "should not be rewritten" do @block_args.should be_nil @rewritten_text.should == "hello @/my-list" end end context "list slug at beginning of line" do def original_text; "@jacob/my-list"; end it "should be rewritten" do @block_args.should == ["@", "jacob", "/my-list"] @rewritten_text.should == "[rewritten]" end end context "username preceded by alpha-numeric character" do def original_text; "meet@jacob/my-list"; end it "should not be rewritten" do @block_args.should be_nil @rewritten_text.should == "meet@jacob/my-list" end end context "username preceded by non-word character" do def original_text; "great.@jacob/my-list"; end it "should be rewritten" do @block_args.should == ["@", "jacob", "/my-list"] @rewritten_text.should == "great.[rewritten]" end end context "username containing non-word characters" do def original_text; "@jacob/my-list&^$%^"; end it "should be rewritten" do @block_args.should == ["@", "jacob", "/my-list"] @rewritten_text.should == "[rewritten]&^$%^" end end context "username over twenty characters" do def original_text @twentyfive_character_list = "a" * 25 "@jacob/#{@twentyfive_character_list}12345" end it "should be rewritten" do @block_args.should == ["@", "jacob", "/#{@twentyfive_character_list}"] @rewritten_text.should == "[rewritten]12345" end end end #}}} describe "rewrite hashtags" do #{{{ before do @rewritten_text = Twitter::Rewriter.rewrite_hashtags(original_text, &method(:block)) end context "with an all numeric hashtag" do def original_text; "#123"; end it "should not be rewritten" do @block_args.should be_nil @rewritten_text.should == "#123" end end context "with a hashtag with 
alphanumeric characters" do def original_text; "#ab1d"; end it "should be rewritten" do @block_args.should == ["#", "ab1d"] @rewritten_text.should == "[rewritten]" end end context "with a hashtag with underscores" do def original_text; "#a_b_c_d"; end it "should be rewritten" do @block_args.should == ["#", "a_b_c_d"] @rewritten_text.should == "[rewritten]" end end context "with a hashtag that is preceded by a word character" do def original_text; "ab#cd"; end it "should not be rewritten" do @block_args.should be_nil @rewritten_text.should == "ab#cd" end end context "with a hashtag that starts with a number but has word characters" do def original_text; "#2ab"; end it "should be rewritten" do @block_args.should == ["#", "2ab"] @rewritten_text.should == "[rewritten]" end end context "with multiple valid hashtags" do def original_text; "I'm frickin' awesome #ab #cd #ef"; end it "rewrites each hashtag" do @block_args.should == [["#", "ab"], ["#", "cd"], ["#", "ef"]] @rewritten_text.should == "I'm frickin' awesome [rewritten] [rewritten] [rewritten]" end end context "with a hashtag preceded by a ." do def original_text; "ok, great.#abc"; end it "should be rewritten" do @block_args.should == ["#", "abc"] @rewritten_text.should == "ok, great.[rewritten]" end end context "with a hashtag preceded by a &" do def original_text; "&#nbsp;"; end it "should not be rewritten" do @block_args.should be_nil @rewritten_text.should == "&#nbsp;" end end context "with a hashtag that ends in an !" do def original_text; "#great!"; end it "should be rewritten, but should not include the !" do @block_args.should == ["#", "great"]; @rewritten_text.should == "[rewritten]!" 
end end context "with a hashtag followed by Japanese" do def original_text; "#twj_devの"; end it "should be rewritten" do @block_args.should == ["#", "twj_devの"]; @rewritten_text.should == "[rewritten]" end end context "with a hashtag preceded by a full-width space" do def original_text; "#{[0x3000].pack('U')}#twj_dev"; end it "should be rewritten" do @block_args.should == ["#", "twj_dev"]; @rewritten_text.should == " [rewritten]" end end context "with a hashtag followed by a full-width space" do def original_text; "#twj_dev#{[0x3000].pack('U')}"; end it "should be rewritten" do @block_args.should == ["#", "twj_dev"]; @rewritten_text.should == "[rewritten] " end end context "with a hashtag using full-width hash" do def original_text; "#{[0xFF03].pack('U')}twj_dev"; end it "should be rewritten" do @block_args.should == ["#", "twj_dev"]; @rewritten_text.should == "[rewritten]" end end context "with a hashtag containing an accented latin character" do def original_text # the hashtag is #éhashtag "##{[0x00e9].pack('U')}hashtag" end it "should be rewritten" do @block_args.should == ["#", "éhashtag"]; @rewritten_text.should == "[rewritten]" end end end #}}} describe "rewrite urls" do #{{{ def url; "http://www.google.com"; end before do @rewritten_text = Twitter::Rewriter.rewrite_urls(original_text, &method(:block)) end context "when embedded in plain text" do def original_text; "On my search engine #{url} I found good links."; end it "should be rewritten" do @block_args.should == [url]; @rewritten_text.should == "On my search engine [rewritten] I found good links." 
end end context "when surrounded by Japanese;" do def original_text; "いまなにしてる#{url}いまなにしてる"; end it "should be rewritten" do @block_args.should == [url]; @rewritten_text.should == "いまなにしてる[rewritten]いまなにしてる" end end context "with a path surrounded by parentheses;" do def original_text; "I found a neatness (#{url})"; end it "should be rewritten" do @block_args.should == [url]; @rewritten_text.should == "I found a neatness ([rewritten])" end context "when the URL ends with a slash;" do def url; "http://www.google.com/"; end it "should be rewritten" do @block_args.should == [url]; @rewritten_text.should == "I found a neatness ([rewritten])" end end context "when the URL has a path;" do def url; "http://www.google.com/fsdfasdf"; end it "should be rewritten" do @block_args.should == [url]; @rewritten_text.should == "I found a neatness ([rewritten])" end end end context "when path contains parens" do def original_text; "I found a neatness (#{url})"; end it "should be rewritten" do @block_args.should == [url]; @rewritten_text.should == "I found a neatness ([rewritten])" end context "wikipedia" do def url; "http://en.wikipedia.org/wiki/Madonna_(artist)"; end it "should be rewritten" do @block_args.should == [url]; @rewritten_text.should == "I found a neatness ([rewritten])" end end context "IIS session" do def url; "http://msdn.com/S(deadbeef)/page.htm"; end it "should be rewritten" do @block_args.should == [url]; @rewritten_text.should == "I found a neatness ([rewritten])" end end context "unbalanced parens" do def url; "http://example.com/i_has_a_("; end it "should be rewritten" do @block_args.should == ["http://example.com/i_has_a_"]; @rewritten_text.should == "I found a neatness ([rewritten]()" end end context "balanced parens with a double quote inside" do def url; "http://foo.bar.com/foo_(\")_bar" end it "should be rewritten" do @block_args.should == ["http://foo.bar.com/foo_"]; @rewritten_text.should == "I found a neatness ([rewritten](\")_bar)" end end context 
"balanced parens hiding XSS" do def url; 'http://x.xx.com/("style="color:red"onmouseover="alert(1)' end it "should be rewritten" do @block_args.should == ["http://x.xx.com/"]; @rewritten_text.should == 'I found a neatness ([rewritten]("style="color:red"onmouseover="alert(1))' end end end context "when preceded by a :" do def original_text; "Check this out @hoverbird:#{url}"; end it "should be rewritten" do @block_args.should == [url]; @rewritten_text.should == "Check this out @hoverbird:[rewritten]" end end context "with a URL ending in allowed punctuation" do it "does not consume ending punctuation" do %w| ? ! , . : ; ] ) } = \ ' |.each do |char| Twitter::Rewriter.rewrite_urls("#{url}#{char}") do |url| url.should == url; "[rewritten]" end.should == "[rewritten]#{char}" end end end context "with a URL preceded in forbidden characters" do it "should be rewritten" do %w| \ ' / ! = |.each do |char| Twitter::Rewriter.rewrite_urls("#{char}#{url}") do |url| "[rewritten]" # should not be called here. 
end.should == "#{char}[rewritten]" end end end context "when embedded in a link tag" do def original_text; "#{url}"; end it "should be rewritten" do @block_args.should == [url]; @rewritten_text.should == "[rewritten]" end end context "with multiple URLs" do def original_text; "http://www.links.org link at start of page, link at end http://www.foo.org"; end it "should autolink each one" do @block_args.should == [["http://www.links.org"], ["http://www.foo.org"]]; @rewritten_text.should == "[rewritten] link at start of page, link at end [rewritten]" end end context "with multiple URLs in different formats" do def original_text; "http://foo.com https://bar.com http://mail.foobar.org"; end it "should autolink each one, in the proper order" do @block_args.should == [["http://foo.com"], ["https://bar.com"], ["http://mail.foobar.org"]]; @rewritten_text.should == "[rewritten] [rewritten] [rewritten]" end end context "with a URL having a long TLD" do def original_text; "Yahoo integriert Facebook http://golem.mobi/0912/71607.html"; end it "should autolink it" do @block_args.should == ["http://golem.mobi/0912/71607.html"] @rewritten_text.should == "Yahoo integriert Facebook [rewritten]" end end context "with a url lacking the protocol" do def original_text; "I like www.foobar.com dudes"; end it "does not link at all" do @block_args.should be_nil @rewritten_text.should == "I like www.foobar.com dudes" end end context "with a @ in a URL" do context "with XSS attack" do def original_text; 'http://x.xx.com/@"style="color:pink"onmouseover=alert(1)//'; end it "should not allow XSS follwing @" do @block_args.should == ["http://x.xx.com/"] @rewritten_text.should == '[rewritten]@"style="color:pink"onmouseover=alert(1)//' end end context "with a username not followed by a /" do def original_text; "http://example.com/@foobar"; end it "should link url" do @block_args.should == ["http://example.com/@foobar"] @rewritten_text.should == "[rewritten]" end end context "with a username followed 
by a /" do def original_text; "http://example.com/@foobar/"; end it "should not link the username but link full url" do @block_args.should == ["http://example.com/@foobar/"] @rewritten_text.should == "[rewritten]" end end end end #}}} end # vim: foldmethod=marker twitter-text-rb-1.7.0/spec/spec_helper.rb000066400000000000000000000076021224076600300203610ustar00rootroot00000000000000$TESTING=true # Ruby 1.8 encoding check major, minor, patch = RUBY_VERSION.split('.') if major.to_i == 1 && minor.to_i < 9 $KCODE='u' end $:.push File.join(File.dirname(__FILE__), '..', 'lib') require 'nokogiri' require 'json' require 'simplecov' SimpleCov.start do add_group 'Libraries', 'lib' end require File.expand_path('../../lib/twitter-text', __FILE__) require File.expand_path('../test_urls', __FILE__) RSpec.configure do |config| config.include TestUrls end RSpec::Matchers.define :match_autolink_expression do match do |string| !Twitter::Extractor.extract_urls(string).empty? end end RSpec::Matchers.define :match_autolink_expression_in do |text| match do |url| @match_data = Twitter::Regex[:valid_url].match(text) @match_data && @match_data.to_s.strip == url end failure_message_for_should do |url| "Expected to find url '#{url}' in text '#{text}', but the match was #{@match_data.captures}'" end end RSpec::Matchers.define :have_autolinked_url do |url, inner_text| match do |text| @link = Nokogiri::HTML(text).search("a[@href='#{url}']") @link && @link.inner_text && (inner_text && @link.inner_text == inner_text) || (!inner_text && @link.inner_text == url) end failure_message_for_should do |text| "Expected url '#{url}'#{", inner_text '#{inner_text}'" if inner_text} to be autolinked in '#{text}'" end end RSpec::Matchers.define :link_to_screen_name do |screen_name, inner_text| expected = inner_text ? 
inner_text : screen_name match do |text| @link = Nokogiri::HTML(text).search("a.username") @link && @link.inner_text == expected && "https://twitter.com/#{screen_name}".downcase.should == @link.first['href'] end failure_message_for_should do |text| if @link.first "Expected link '#{@link.inner_text}' with href '#{@link.first['href']}' to match screen_name '#{expected}', but it does not." else "Expected screen name '#{screen_name}' to be autolinked in '#{text}', but no link was found." end end failure_message_for_should_not do |text| "Expected link '#{@link.inner_text}' with href '#{@link.first['href']}' not to match screen_name '#{expected}', but it does." end description do "contain a link with the name and href pointing to the expected screen_name" end end RSpec::Matchers.define :link_to_list_path do |list_path, inner_text| expected = inner_text ? inner_text : list_path match do |text| @link = Nokogiri::HTML(text).search("a.list-slug") @link && @link.inner_text == expected && "https://twitter.com/#{list_path}".downcase.should == @link.first['href'] end failure_message_for_should do |text| if @link.first "Expected link '#{@link.inner_text}' with href '#{@link.first['href']}' to match the list path '#{expected}', but it does not." else "Expected list path '#{list_path}' to be autolinked in '#{text}', but no link was found." end end failure_message_for_should_not do |text| "Expected link '#{@link.inner_text}' with href '#{@link.first['href']}' not to match the list path '#{expected}', but it does." 
end description do "contain a link with the list title and an href pointing to the list path" end end RSpec::Matchers.define :have_autolinked_hashtag do |hashtag| match do |text| @link = Nokogiri::HTML(text).search("a[@href='https://twitter.com/#!/search?q=#{hashtag.sub(/^#/, '%23')}']") @link && @link.inner_text && @link.inner_text == hashtag end failure_message_for_should do |text| if @link.first "Expected link text to be [#{hashtag}], but it was [#{@link.inner_text}] in #{text}" else "Expected hashtag #{hashtag} to be autolinked in '#{text}', but no link was found." end end failure_message_for_should_not do |text| "Expected link '#{@link.inner_text}' with href '#{@link.first['href']}' not to match the hashtag '#{hashtag}', but it does." end end twitter-text-rb-1.7.0/spec/test_urls.rb000066400000000000000000000050061224076600300201100ustar00rootroot00000000000000# encoding: utf-8 module TestUrls VALID = [ "http://google.com", "http://foobar.com/#", "http://google.com/#foo", "http://google.com/#search?q=iphone%20-filter%3Alinks", "http://twitter.com/#search?q=iphone%20-filter%3Alinks", "http://somedomain.com/index.php?path=/abc/def/", "http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html", "http://somehost.com:3000", "http://xo.com/~matthew+%-x", "http://en.wikipedia.org/wiki/Primer_(film)", "http://www.ams.org/bookstore-getitem/item=mbk-59", "http://chilp.it/?77e8fd", "http://tell.me/why", "http://longtlds.info", "http://✪df.ws/ejp", "http://日本.com", "http://search.twitter.com/search?q=avro&lang=en", "http://mrs.domain-dash.biz", "http://x.com/has/one/char/domain", "http://t.co/nwcLTFF", "http://sub_domain-dash.twitter.com", "http://a.b.cd", "http://a_b.c-d.com", "http://a-b.b.com", "http://twitter-dash.com", "http://msdn.microsoft.com/ja-jp/library/system.net.httpwebrequest(v=VS.100).aspx", "www.foobar.com", "WWW.FOOBAR.COM", "www.foobar.co.jp", "http://t.co", "t.co/nwcLTFF" ] unless defined?(TestUrls::VALID) INVALID = [ "http://no-tld", 
"http://tld-too-short.x", "http://-doman_dash.com", "http://_leadingunderscore.twitter.com", "http://trailingunderscore_.twitter.com", "http://-leadingdash.twitter.com", "http://trailingdash-.twitter.com", "http://-leadingdash.com", "http://trailingdash-.com", "http://no_underscores.com", "http://test.c_o_m", "http://test.c-o-m", "http://twitt#{[0x202A].pack('U')}er.com", "http://twitt#{[0x202B].pack('U')}er.com", "http://twitt#{[0x202C].pack('U')}er.com", "http://twitt#{[0x202D].pack('U')}er.com", "http://twitt#{[0x202E].pack('U')}er.com" ] unless defined?(TestUrls::INVALID) TCO = [ "http://t.co/P53cv5yO!", "http://t.co/fQJmiPGg***", "http://t.co/pbY2NfTZ's", "http://t.co/2vYHpAc5;", "http://t.co/ulYGBYSo:", "http://t.co/GeT4bSiw=win", "http://t.co/8MkmHU0k+fun", "http://t.co/TKLp64dY.yes,", "http://t.co/8vuO27cI$$", "http://t.co/rPYTvdA8/", "http://t.co/WvtMw5ku%", "http://t.co/8t7G3ddS#", "http://t.co/nfHNJDV2/#!", "http://t.co/gK6NOXHs[good]", "http://t.co/dMrT0o1Y]bad", "http://t.co/FNkPfmii-", "http://t.co/sMgS3pjI_oh", "http://t.co/F8Dq3Plb~", "http://t.co/ivvH58vC&help", "http://t.co/iUBL15zD|NZ5KYLQ8" ] unless defined?(TestUrls::TCO) end twitter-text-rb-1.7.0/spec/twitter_text_spec.rb000066400000000000000000000006311224076600300216430ustar00rootroot00000000000000# encoding: utf-8 require File.dirname(__FILE__) + '/spec_helper' major, minor, patch = RUBY_VERSION.split('.') if major.to_i == 1 && minor.to_i < 9 describe "base" do before do $KCODE = 'NONE' end after do $KCODE = 'u' end it "should raise with invalid KCODE on Ruby < 1.9" do lambda do require 'twitter-text' end.should raise_error end end end twitter-text-rb-1.7.0/spec/unicode_spec.rb000066400000000000000000000016651224076600300205330ustar00rootroot00000000000000# encoding: utf-8 require File.dirname(__FILE__) + '/spec_helper' describe Twitter::Unicode do it "should lazy-init constants" do Twitter::Unicode.const_defined?(:UFEB6).should == false Twitter::Unicode::UFEB6.should_not be_nil 
Twitter::Unicode::UFEB6.should be_kind_of(String) Twitter::Unicode.const_defined?(:UFEB6).should == true end it "should return corresponding character" do Twitter::Unicode::UFEB6.should == [0xfeb6].pack('U') end it "should allow lowercase notation" do Twitter::Unicode::Ufeb6.should == Twitter::Unicode::UFEB6 Twitter::Unicode::Ufeb6.should === Twitter::Unicode::UFEB6 end it "should allow underscore notation" do Twitter::Unicode::U_FEB6.should == Twitter::Unicode::UFEB6 Twitter::Unicode::U_FEB6.should === Twitter::Unicode::UFEB6 end it "should raise on invalid codepoints" do lambda { Twitter::Unicode::FFFFFF }.should raise_error(NameError) end end twitter-text-rb-1.7.0/spec/validation_spec.rb000066400000000000000000000030231224076600300212250ustar00rootroot00000000000000# encoding: binary require File.dirname(__FILE__) + '/spec_helper' class TestValidation include Twitter::Validation end describe Twitter::Validation do it "should disallow invalid BOM character" do TestValidation.new.tweet_invalid?("Bom:#{Twitter::Unicode::UFFFE}").should == :invalid_characters TestValidation.new.tweet_invalid?("Bom:#{Twitter::Unicode::UFEFF}").should == :invalid_characters end it "should disallow invalid U+FFFF character" do TestValidation.new.tweet_invalid?("Bom:#{Twitter::Unicode::UFFFF}").should == :invalid_characters end it "should disallow direction change characters" do [0x202A, 0x202B, 0x202C, 0x202D, 0x202E].map{|cp| [cp].pack('U') }.each do |char| TestValidation.new.tweet_invalid?("Invalid:#{char}").should == :invalid_characters end end it "should disallow non-Unicode" do TestValidation.new.tweet_invalid?("not-Unicode:\xfff0").should == :invalid_characters end it "should allow <= 140 combined accent characters" do char = [0x65, 0x0301].pack('U') TestValidation.new.tweet_invalid?(char * 139).should == false TestValidation.new.tweet_invalid?(char * 140).should == false TestValidation.new.tweet_invalid?(char * 141).should == :too_long end it "should allow <= 140 multi-byte 
characters" do char = [ 0x1d106 ].pack('U') TestValidation.new.tweet_invalid?(char * 139).should == false TestValidation.new.tweet_invalid?(char * 140).should == false TestValidation.new.tweet_invalid?(char * 141).should == :too_long end end twitter-text-rb-1.7.0/test/000077500000000000000000000000001224076600300155635ustar00rootroot00000000000000twitter-text-rb-1.7.0/test/conformance_test.rb000066400000000000000000000142451224076600300214470ustar00rootroot00000000000000require 'multi_json' require 'nokogiri' require 'test/unit' require 'yaml' # Ruby 1.8 encoding check major, minor, patch = RUBY_VERSION.split('.') if major.to_i == 1 && minor.to_i < 9 $KCODE='u' end require File.expand_path('../../lib/twitter-text', __FILE__) class ConformanceTest < Test::Unit::TestCase include Twitter::Extractor include Twitter::Autolink include Twitter::HitHighlighter include Twitter::Validation private %w(description expected text json hits).each do |key| define_method key.to_sym do @test_info[key] end end def assert_equal_without_attribute_order(expected, actual, failure_message = nil) assert_block(build_message(failure_message, " expected but was\n", expected, actual)) do equal_nodes?(Nokogiri::HTML(expected).root, Nokogiri::HTML(actual).root) end end def equal_nodes?(expected, actual) return false unless expected.name == actual.name return false unless ordered_attributes(expected) == ordered_attributes(actual) return false if expected.text? && actual.text? 
&& expected.content != actual.content expected.children.each_with_index do |child, index| return false unless equal_nodes?(child, actual.children[index]) end true end def ordered_attributes(element) element.attribute_nodes.map{|attr| [attr.name, attr.value]}.sort end CONFORMANCE_DIR = ENV['CONFORMANCE_DIR'] || File.expand_path("../twitter-text-conformance", __FILE__) def self.def_conformance_test(file, test_type, &block) yaml = YAML.load_file(File.join(CONFORMANCE_DIR, file)) raise "No such test suite: #{test_type.to_s}" unless yaml["tests"][test_type.to_s] yaml["tests"][test_type.to_s].each do |test_info| name = :"test_#{test_type} #{test_info['description']}" define_method name do @test_info = test_info instance_eval(&block) end end end public # Extractor Conformance def_conformance_test("extract.yml", :replies) do assert_equal expected, extract_reply_screen_name(text), description end def_conformance_test("extract.yml", :mentions) do assert_equal expected, extract_mentioned_screen_names(text), description end def_conformance_test("extract.yml", :mentions_with_indices) do e = expected.map{|elem| elem.inject({}){|h, (k,v)| h[k.to_sym] = v; h} } assert_equal e, extract_mentioned_screen_names_with_indices(text), description end def_conformance_test("extract.yml", :mentions_or_lists_with_indices) do e = expected.map{|elem| elem.inject({}){|h, (k,v)| h[k.to_sym] = v; h} } assert_equal e, extract_mentions_or_lists_with_indices(text), description end def_conformance_test("extract.yml", :urls) do assert_equal expected, extract_urls(text), description expected.each do |expected_url| assert_equal true, valid_url?(expected_url, true, false), "expected url [#{expected_url}] not valid" end end def_conformance_test("extract.yml", :urls_with_indices) do e = expected.map{|elem| elem.inject({}){|h, (k,v)| h[k.to_sym] = v; h} } assert_equal e, extract_urls_with_indices(text), description end def_conformance_test("extract.yml", :hashtags) do assert_equal expected, 
extract_hashtags(text), description end def_conformance_test("extract.yml", :hashtags_with_indices) do e = expected.map{|elem| elem.inject({}){|h, (k,v)| h[k.to_sym] = v; h} } assert_equal e, extract_hashtags_with_indices(text), description end def_conformance_test("extract.yml", :cashtags) do assert_equal expected, extract_cashtags(text), description end def_conformance_test("extract.yml", :cashtags_with_indices) do e = expected.map{|elem| elem.inject({}){|h, (k,v)| h[k.to_sym] = v; h} } assert_equal e, extract_cashtags_with_indices(text), description end # Autolink Conformance def_conformance_test("autolink.yml", :usernames) do assert_equal_without_attribute_order expected, auto_link_usernames_or_lists(text, :suppress_no_follow => true), description end def_conformance_test("autolink.yml", :lists) do assert_equal_without_attribute_order expected, auto_link_usernames_or_lists(text, :suppress_no_follow => true), description end def_conformance_test("autolink.yml", :urls) do assert_equal_without_attribute_order expected, auto_link_urls(text, :suppress_no_follow => true), description end def_conformance_test("autolink.yml", :hashtags) do assert_equal_without_attribute_order expected, auto_link_hashtags(text, :suppress_no_follow => true), description end def_conformance_test("autolink.yml", :cashtags) do assert_equal_without_attribute_order expected, auto_link_cashtags(text, :suppress_no_follow => true), description end def_conformance_test("autolink.yml", :all) do assert_equal_without_attribute_order expected, auto_link(text, :suppress_no_follow => true), description end def_conformance_test("autolink.yml", :json) do assert_equal_without_attribute_order expected, auto_link_with_json(text, MultiJson.load(json), :suppress_no_follow => true), description end # HitHighlighter Conformance def_conformance_test("hit_highlighting.yml", :plain_text) do assert_equal expected, hit_highlight(text, hits), description end def_conformance_test("hit_highlighting.yml", :with_links) 
do assert_equal expected, hit_highlight(text, hits), description end # Validation Conformance def_conformance_test("validate.yml", :tweets) do assert_equal expected, valid_tweet_text?(text), description end def_conformance_test("validate.yml", :usernames) do assert_equal expected, valid_username?(text), description end def_conformance_test("validate.yml", :lists) do assert_equal expected, valid_list?(text), description end def_conformance_test("validate.yml", :urls) do assert_equal expected, valid_url?(text), description end def_conformance_test("validate.yml", :urls_without_protocol) do assert_equal expected, valid_url?(text, true, false), description end def_conformance_test("validate.yml", :hashtags) do assert_equal expected, valid_hashtag?(text), description end def_conformance_test("validate.yml", :lengths) do assert_equal expected, tweet_length(text), description end end twitter-text-rb-1.7.0/test/twitter-text-conformance/000077500000000000000000000000001224076600300225375ustar00rootroot00000000000000twitter-text-rb-1.7.0/twitter-text.gemspec000066400000000000000000000024211224076600300206340ustar00rootroot00000000000000# encoding: utf-8 Gem::Specification.new do |s| s.name = "twitter-text" s.version = "1.7.0" s.authors = ["Matt Sanford", "Patrick Ewing", "Ben Cherry", "Britt Selvitelle", "Raffi Krikorian", "J.P. 
Cummins", "Yoshimasa Niwa", "Keita Fujii", "James Koval"] s.email = ["matt@twitter.com", "patrick.henry.ewing@gmail.com", "bcherry@gmail.com", "bs@brittspace.com", "raffi@twitter.com", "jcummins@twitter.com", "niw@niw.at", "keita@twitter.com", "jkoval@twitter.com"] s.homepage = "http://twitter.com" s.description = s.summary = "A gem that provides text handling for Twitter" s.license = "Apache 2.0" s.platform = Gem::Platform::RUBY s.has_rdoc = true s.summary = "Twitter text handling library" s.add_development_dependency "multi_json", "~> 1.3" s.add_development_dependency "nokogiri", "~> 1.5.10" s.add_development_dependency "rake" s.add_development_dependency "rdoc" s.add_development_dependency "rspec" s.add_development_dependency "simplecov" s.add_runtime_dependency "unf", "~> 0.1.0" s.files = `git ls-files`.split("\n") s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n") s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) } s.require_paths = ["lib"] end