spider-0.5.0/.gitignore
*.gem
/Gemfile.lock
/.bundle
/vendor
/doc
/pkg
/rdoc
/.yardoc
spider-0.5.0/.rubocop.yml
Style/Documentation:
Enabled: false
Metrics/LineLength:
Max: 200
spider-0.5.0/AUTHORS
The Ruby Spider Gem would not be what it is today without the help of
the following kind souls:
Brian Campbell
Henri Cook
James Edward Gray II
Joao Eriberto Mota Filho
John Buckley
John Nagro
Mike Burns
Matt Horan
Sander van der Vliet
spider-0.5.0/CHANGES
2016-05-13
* fixed #1 thanks to @eribertomota
* got it running on more recent versions of ruby
* cleaned up the docs a bit
* cleaned up the licensing and attribution
2009-05-21
* fixed an issue with robots.txt on ssl hosts
* fixed an issue with pulling robots.txt from disallowed hosts
* fixed a documentation error with ExpiredLinks
* Many thanks to Brian Campbell
2008-10-09
* fixed a situation with nested slashes in urls, thanks to Sander van der Vliet and John Buckley
2008-07-06
* Trap interrupts and shutdown gracefully
* Support for custom urls-to-crawl objects
* Example AmazonSQS urls-to-crawl support (next_urls_in_sqs.rb)
2007-11-09:
* Handle redirects that assume a base URL.
2007-11-08:
* Move spider_instance.rb, robot_rules.rb, and included_in_memcached.rb into
spider subdirectory.
2007-11-02:
* Memcached support.
2007-10-31:
* Add `setup' and `teardown' handlers.
* Can set the headers for a HTTP request.
* Changed :any to :every .
* Changed the arguments to the :every, :success, :failure, and code handler.
2007-10-23:
* URLs without a page component but with a query component.
* HTTP Redirect.
* HTTPS.
* Version 0.2.1 .
2007-10-22:
* Use RSpec to ensure that it mostly works.
* Use WEBrick to create a small test server for additional testing.
* Completely re-do the API to prepare for future expansion.
* Add the ability to apply each URL to a series of custom allowed?-like
matchers.
* BSD license.
* Version 0.2.0 .
2007-03-30:
* Clean up the documentation.
2007-03-28:
* Change the tail recursion to a `while' loop, to please Ruby.
* Documentation.
* Initial release: version 0.1.0 .
spider-0.5.0/LICENSE
The MIT License (MIT)
Copyright (c) 2007-2016 Spider Team Authors
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
spider-0.5.0/README.md
# Spider
_a Web spidering library for Ruby. It handles the robots.txt,
scraping, collecting, and looping so that you can just handle the data._
## Examples
### Crawl the Web, loading each page in turn, until you run out of memory
```ruby
require 'spider'
Spider.start_at('http://cashcats.biz/') {}
```
### To handle erroneous responses
```ruby
require 'spider'
Spider.start_at('http://cashcats.biz/') do |s|
s.on :failure do |a_url, resp, prior_url|
puts "URL failed: #{a_url}"
puts " linked from #{prior_url}"
end
end
```
### Or handle successful responses
```ruby
require 'spider'
Spider.start_at('http://cashcats.biz/') do |s|
s.on :success do |a_url, resp, prior_url|
puts "#{a_url}: #{resp.code}"
puts resp.body
puts
end
end
```
### Limit to just one domain
```ruby
require 'spider'
Spider.start_at('http://cashcats.biz/') do |s|
s.add_url_check do |a_url|
a_url =~ %r{^http://cashcats.biz.*}
end
end
```
### Pass headers to some requests
```ruby
require 'spider'
Spider.start_at('http://cashcats.biz/') do |s|
s.setup do |a_url|
if a_url =~ %r{^http://.*wikipedia.*}
      s.headers['User-Agent'] = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
end
end
end
```
### Use memcached to track cycles
```ruby
require 'spider'
require 'spider/included_in_memcached'
SERVERS = ['10.0.10.2:11211','10.0.10.3:11211','10.0.10.4:11211']
Spider.start_at('http://cashcats.biz/') do |s|
s.check_already_seen_with IncludedInMemcached.new(SERVERS)
end
```
### Track cycles with a custom object
```ruby
require 'spider'
class ExpireLinks < Hash
def <<(v)
self[v] = Time.now
end
def include?(v)
self[v].kind_of?(Time) && (self[v] + 86400) >= Time.now
end
end
Spider.start_at('http://cashcats.biz/') do |s|
s.check_already_seen_with ExpireLinks.new
end
```
### Store nodes to visit with Amazon SQS
```ruby
require 'spider'
require 'spider/next_urls_in_sqs'
Spider.start_at('http://cashcats.biz') do |s|
s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY)
end
```
### Store nodes to visit with a custom object
```ruby
require 'spider'
class MyArray < Array
def pop
super
end
def push(a_msg)
super(a_msg)
end
end
Spider.start_at('http://cashcats.biz') do |s|
s.store_next_urls_with MyArray.new
end
```
### Create a URL graph
```ruby
require 'spider'
nodes = {}
Spider.start_at('http://cashcats.biz/') do |s|
s.add_url_check {|a_url| a_url =~ %r{^http://cashcats.biz.*} }
s.on(:every) do |a_url, resp, prior_url|
nodes[prior_url] ||= []
nodes[prior_url] << a_url
end
end
```
### Use a proxy
```ruby
require 'net/http_configuration'
require 'spider'
http_conf = Net::HTTP::Configuration.new(:proxy_host => '7proxies.org',
:proxy_port => 8881)
http_conf.apply do
Spider.start_at('http://img.4chan.org/b/') do |s|
s.on(:success) do |a_url, resp, prior_url|
File.open(a_url.gsub('/',':'),'w') do |f|
f.write(resp.body)
end
end
end
end
```
spider-0.5.0/lib/spider.rb
require File.dirname(__FILE__)+'/spider/spider_instance'
# A spidering library for Ruby. Handles robots.txt, scraping, finding more
# links, and doing it all over again.
class Spider
VERSION_INFO = [0, 5, 0] unless defined?(self::VERSION_INFO)
VERSION = VERSION_INFO.map(&:to_s).join('.') unless defined?(self::VERSION)
def self.version
VERSION
end
# Runs the spider starting at the given URL. Also takes a block that is given
# the SpiderInstance. Use the block to define the rules and handlers for
# the discovered Web pages. See SpiderInstance for the possible rules and
# handlers.
#
# Spider.start_at('http://cashcats.biz/') do |s|
# s.add_url_check do |a_url|
# a_url =~ %r{^http://cashcats.biz.*}
# end
#
# s.on 404 do |a_url, resp, prior_url|
# puts "URL not found: #{a_url}"
# end
#
# s.on :success do |a_url, resp, prior_url|
# puts "body: #{resp.body}"
# end
#
# s.on :every do |a_url, resp, prior_url|
# puts "URL returned anything: #{a_url} with this code #{resp.code}"
# end
# end
def self.start_at(a_url, &block)
rules = RobotRules.new("Ruby Spider #{Spider::VERSION}")
a_spider = SpiderInstance.new({nil => [a_url]}, [], rules, [])
block.call(a_spider)
a_spider.start!
end
end
spider-0.5.0/lib/spider/included_in_memcached.rb
# Use memcached to track cycles.
require 'memcache'
# A specialized class using memcached to track items stored. It supports
# three operations: new, <<, and include? . Together these can be used to
# add items to the memcache, then determine whether the item has been added.
#
# To use it with Spider use the check_already_seen_with method:
#
# Spider.start_at('http://example.com/') do |s|
# s.check_already_seen_with IncludedInMemcached.new('localhost:11211')
# end
class IncludedInMemcached
# Construct a new IncludedInMemcached instance. All arguments here are
# passed to MemCache (part of the memcache-client gem).
def initialize(*a)
@c = MemCache.new(*a)
end
# Add an item to the memcache.
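  # (MemCache#add stores the key only if it is not already present,
  # which is exactly the behavior cycle detection needs.)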
def <<(v)
@c.add(v.to_s, v)
end
# True if the item is in the memcache.
def include?(v)
@c.get(v.to_s) == v
end
end
spider-0.5.0/lib/spider/next_urls_in_sqs.rb
# Use AmazonSQS to track nodes to visit.
require 'rubygems'
require 'right_aws'
require 'yaml'
# A specialized class using AmazonSQS to track nodes to walk. It supports
# two operations: push and pop . Together these can be used to
# add items to the queue, then pull items off the queue.
#
# This is useful if you want multiple Spider processes crawling the same
# data set.
#
# To use it with Spider use the store_next_urls_with method:
#
# Spider.start_at('http://example.com/') do |s|
# s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY, queue_name)
# end
class NextUrlsInSQS
# Construct a new NextUrlsInSQS instance. All arguments here are
# passed to RightAWS::SqsGen2 (part of the right_aws gem) or used
# to set the AmazonSQS queue name (optional).
def initialize(aws_access_key, aws_secret_access_key, queue_name = 'ruby-spider')
@sqs = RightAws::SqsGen2.new(aws_access_key, aws_secret_access_key)
@queue = @sqs.queue(queue_name)
end
  # Pull an item off the queue, polling every 5 seconds until data is
  # found. Data is encoded with YAML.
def pop
while true
message = @queue.pop
return YAML::load(message.to_s) unless message.nil?
sleep 5
end
end
# Put data on the queue. Data is encoded with YAML.
def push(a_msg)
encoded_message = YAML::dump(a_msg)
    @queue.push(encoded_message)
end
end
spider-0.5.0/lib/spider/robot_rules.rb
#!/usr/local/bin/ruby -w
# robot_rules.rb
#
# Created by James Edward Gray II on 2006-01-31.
# Copyright 2006 Gray Productions. All rights reserved.
# https://github.com/eribertomota/robot_rules.rb
# https://github.com/johnnagro/spider/issues/1
require "uri"
# Based on Perl's WWW::RobotRules module, by Gisle Aas.
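#
# A minimal usage sketch (the URLs and the robots.txt body below are
# only illustrative):
#
#   rules = RobotRules.new("Ruby Spider 0.5.0")
#   rules.parse("http://example.com/robots.txt",
#               "User-Agent: *\nDisallow: /private")
#   rules.allowed?("http://example.com/private/page.html") # => false
#   rules.allowed?("http://example.com/index.html")        # => true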
class RobotRules
def initialize( user_agent )
@user_agent = user_agent.scan(/\S+/).first.sub(%r{/.*},
"").downcase
@rules = Hash.new { |rules, rule| rules[rule] = Array.new }
end
def parse( text_uri, robots_data )
uri = URI.parse(text_uri)
location = "#{uri.host}:#{uri.port}"
@rules.delete(location)
rules = robots_data.split(/[\015\012]+/).
map { |rule| rule.sub(/\s*#.*$/, "") }
anon_rules = Array.new
my_rules = Array.new
current = anon_rules
rules.each do |rule|
case rule
when /^\s*User-Agent\s*:\s*(.+?)\s*$/i
break unless my_rules.empty?
current = if $1 == "*"
anon_rules
elsif $1.downcase.index(@user_agent)
my_rules
else
nil
end
when /^\s*Disallow\s*:\s*(.*?)\s*$/i
next if current.nil?
if $1.empty?
current << nil
else
disallow = URI.parse($1)
next unless disallow.scheme.nil? or disallow.scheme ==
uri.scheme
next unless disallow.port.nil? or disallow.port == uri.port
next unless disallow.host.nil? or
disallow.host.downcase == uri.host.downcase
disallow = disallow.path
disallow = "/" if disallow.empty?
disallow = "/#{disallow}" unless disallow[0] == ?/
current << disallow
end
end
end
@rules[location] = if my_rules.empty?
anon_rules.compact
else
my_rules.compact
end
end
def allowed?( text_uri )
uri = URI.parse(text_uri)
location = "#{uri.host}:#{uri.port}"
path = uri.path
return true unless %w{http https}.include?(uri.scheme)
not @rules[location].any? { |rule| path.index(rule) == 0 }
end
end
spider-0.5.0/lib/spider/spider_instance.rb
# Specialized spidering rules.
require File.dirname(__FILE__)+'/robot_rules.rb'
require 'open-uri'
require 'uri'
require 'net/http'
require 'net/https'
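# The small monkeypatches below give every Net::HTTPResponse a success?
# and redirect? predicate (used by get_page and do_callbacks), and let a
# nil headers hash be merged as though it were empty.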
module Net #:nodoc:
class HTTPResponse #:nodoc:
def success?; false; end
def redirect?; false; end
end
class HTTPSuccess #:nodoc:
def success?; true; end
end
class HTTPRedirection #:nodoc:
def redirect?; true; end
end
end
class NilClass #:nodoc:
def merge(h); h; end
end
class SpiderInstance
def initialize(next_urls, seen = [], rules = nil, robots_seen = []) #:nodoc:
@url_checks = []
@cache = :memory
@callbacks = {}
@next_urls = [next_urls]
@seen = seen
@rules = rules || RobotRules.new("Ruby Spider #{Spider::VERSION}")
@robots_seen = robots_seen
@headers = {}
@setup = nil
@teardown = nil
end
# Add a predicate that determines whether to continue down this URL's path.
# All predicates must be true in order for a URL to proceed.
#
# Takes a block that takes a string and produces a boolean. For example, this
# will ensure that the URL starts with 'http://cashcats.biz':
#
#   add_url_check { |a_url| a_url =~ %r{^http://cashcats.biz.*} }
def add_url_check(&block)
@url_checks << block
end
# The Web is a graph; to avoid cycles we store the nodes (URLs) already
# visited. The Web is a really, really, really big graph; as such, this list
# of visited nodes grows really, really, really big.
#
# Change the object used to store these seen nodes with this. The default
# object is an instance of Array. Available with Spider is a wrapper of
# memcached.
#
# You can implement a custom class for this; any object passed to
# check_already_seen_with must understand just << and include? .
#
# # default
# check_already_seen_with Array.new
#
# # memcached
# require 'spider/included_in_memcached'
# check_already_seen_with IncludedInMemcached.new('localhost:11211')
def check_already_seen_with(cacher)
if cacher.respond_to?(:<<) && cacher.respond_to?(:include?)
@seen = cacher
else
      raise ArgumentError, 'expected something that responds to << and include?'
end
end
# The Web is a really, really, really big graph; as such, this list
# of nodes to visit grows really, really, really big.
#
# Change the object used to store nodes we have yet to walk. The default
# object is an instance of Array. Available with Spider is a wrapper of
# AmazonSQS.
#
# You can implement a custom class for this; any object passed to
# store_next_urls_with must understand just push and pop .
#
# # default
# store_next_urls_with Array.new
#
# # AmazonSQS
# require 'spider/next_urls_in_sqs'
# store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY, queue_name)
def store_next_urls_with(a_store)
tmp_next_urls = @next_urls
@next_urls = a_store
tmp_next_urls.each do |a_url_hash|
@next_urls.push a_url_hash
end
end
# Add a response handler. A response handler's trigger can be :every,
# :success, :failure, or any HTTP status code. The handler itself can be
# either a Proc or a block.
#
# The arguments to the block are: the URL as a string, an instance of
# Net::HTTPResponse, and the prior URL as a string.
#
#
# For example:
#
# on 404 do |a_url, resp, prior_url|
# puts "URL not found: #{a_url}"
# end
#
# on :success do |a_url, resp, prior_url|
# puts a_url
# puts resp.body
# end
#
# on :every do |a_url, resp, prior_url|
# puts "Given this code: #{resp.code}"
# end
def on(code, p = nil, &block)
f = p ? p : block
case code
when Fixnum
@callbacks[code] = f
else
@callbacks[code.to_sym] = f
end
end
# Run before the HTTP request. Given the URL as a string.
  #   s.setup do |a_url|
  #     s.headers['Cookies'] = 'user_id=1;admin=true'
  #   end
def setup(p = nil, &block)
@setup = p ? p : block
end
# Run last, once for each page. Given the URL as a string.
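  # For example:
  #   s.teardown do |a_url|
  #     puts "done with #{a_url}"
  #   end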
def teardown(p = nil, &block)
@teardown = p ? p : block
end
# Use like a hash:
# headers['Cookies'] = 'user_id=1;password=btrross3'
def headers
HeaderSetter.new(self)
end
def raw_headers #:nodoc:
@headers
end
def raw_headers=(v) #:nodoc:
@headers = v
end
# Reset the headers hash.
def clear_headers
@headers = {}
end
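  # Main crawl loop: pop a { prior_url => urls } hash off the queue of
  # URLs to visit, run each allowable URL through setup, the HTTP fetch,
  # the registered callbacks, and teardown, then queue the links found
  # on each page. Repeats until the queue is empty; SIGINT is trapped so
  # an interrupt exits cleanly after the current page.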
def start! #:nodoc:
interrupted = false
trap("SIGINT") { interrupted = true }
begin
next_urls = @next_urls.pop
tmp_n_u = {}
next_urls.each do |prior_url, urls|
urls = [urls] unless urls.kind_of?(Array)
urls.map do |a_url|
[a_url, (URI.parse(a_url) rescue nil)]
end.select do |a_url, parsed_url|
allowable_url?(a_url, parsed_url)
end.each do |a_url, parsed_url|
@setup.call(a_url) unless @setup.nil?
get_page(parsed_url) do |response|
do_callbacks(a_url, response, prior_url)
#tmp_n_u[a_url] = generate_next_urls(a_url, response)
#@next_urls.push tmp_n_u
generate_next_urls(a_url, response).each do |a_next_url|
@next_urls.push a_url => a_next_url
end
#exit if interrupted
end
@teardown.call(a_url) unless @teardown.nil?
exit if interrupted
end
end
end while !@next_urls.empty?
end
def success_or_failure(code) #:nodoc:
if code > 199 && code < 300
:success
else
:failure
end
end
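  # A URL is crawlable only if it parsed cleanly, has not been seen
  # before, is permitted by robots.txt, and passes every registered
  # url_check.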
def allowable_url?(a_url, parsed_url) #:nodoc:
!parsed_url.nil? && !@seen.include?(parsed_url) && allowed?(a_url, parsed_url) &&
@url_checks.map{|url_check|url_check.call(a_url)}.all?
end
# True if the robots.txt for that URL allows access to it.
def allowed?(a_url, parsed_url) # :nodoc:
return false unless ['http','https'].include?(parsed_url.scheme)
u = "#{parsed_url.scheme}://#{parsed_url.host}:#{parsed_url.port}/robots.txt"
parsed_u = URI.parse(u)
return false unless @url_checks.map{|url_check|url_check.call(a_url)}.all?
begin
unless @robots_seen.include?(u)
#open(u, 'User-Agent' => 'Ruby Spider',
# 'Accept' => 'text/html,text/xml,application/xml,text/plain', :ssl_verify => false) do |url|
# @rules.parse(u, url.read)
#end
get_page(parsed_u) do |r|
@rules.parse(u, r.body)
end
@robots_seen << u
end
@rules.allowed?(a_url)
rescue OpenURI::HTTPError
true # No robots.txt
rescue Exception, Timeout::Error # to keep it from crashing
false
end
end
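  # Fetch a single page, following redirects, and yield the final
  # Net::HTTPResponse to the block. Network errors are printed and
  # swallowed so one bad URL does not stop the crawl.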
def get_page(parsed_url, &block) #:nodoc:
@seen << parsed_url
begin
http = Net::HTTP.new(parsed_url.host, parsed_url.port)
if parsed_url.scheme == 'https'
http.use_ssl = true
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
end
# Uses start because http.finish cannot be called.
r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri, @headers))}
if r.redirect?
get_page(URI.parse(construct_complete_url(parsed_url,r['Location'])), &block)
else
block.call(r)
end
rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError => e
p e
nil
end
end
def do_callbacks(a_url, resp, prior_url) #:nodoc:
cbs = [@callbacks[:every],
resp.success? ? @callbacks[:success] : @callbacks[:failure],
@callbacks[resp.code]]
cbs.each do |cb|
cb.call(a_url, resp, prior_url) if cb
end
end
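  # Extract candidate links from a response body: pick up an optional
  # <base href>, scan every href="..." attribute, and resolve each link
  # against that base, dropping anything that fails to parse.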
def generate_next_urls(a_url, resp) #:nodoc:
web_page = resp.body
base_url = (web_page.scan(/base\s+href="(.*?)"/i).flatten +
[a_url[0,a_url.rindex('/')]])[0]
base_url = remove_trailing_slash(base_url)
web_page.scan(/href="(.*?)"/i).flatten.map do |link|
begin
parsed_link = URI.parse(link)
if parsed_link.fragment == '#'
nil
else
construct_complete_url(base_url, link, parsed_link)
end
rescue
nil
end
end.compact
end
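  # A few illustrative resolutions (hypothetical URLs):
  #   construct_complete_url('http://example.com/dir', '/a.html') # => "http://example.com/a.html"
  #   construct_complete_url('http://example.com/dir', 'a.html')  # => "http://example.com/dir/a.html"
  #   construct_complete_url('http://example.com/', 'http://other.example/') # => "http://other.example/"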
def construct_complete_url(base_url, additional_url, parsed_additional_url = nil) #:nodoc:
parsed_additional_url ||= URI.parse(additional_url)
case parsed_additional_url.scheme
when nil
u = base_url.is_a?(URI) ? base_url : URI.parse(base_url)
if additional_url[0].chr == '/'
"#{u.scheme}://#{u.host}#{additional_url}"
elsif u.path.nil? || u.path == ''
"#{u.scheme}://#{u.host}/#{additional_url}"
elsif u.path[0].chr == '/'
"#{u.scheme}://#{u.host}#{u.path}/#{additional_url}"
else
"#{u.scheme}://#{u.host}/#{u.path}/#{additional_url}"
end
else
additional_url
end
end
def remove_trailing_slash(s) #:nodoc:
s.sub(%r{/*$},'')
end
class HeaderSetter #:nodoc:
def initialize(si)
@si = si
end
def []=(k,v)
@si.raw_headers = @si.raw_headers.merge({k => v})
end
end
end
spider-0.5.0/spec/spec_helper.rb
require 'rubygems'
require 'webrick'
require 'spec'
Spec::Runner.configure { |c| c.mock_with :mocha }
def local_require(*files)
files.each do |file|
require File.dirname(__FILE__)+'/../lib/'+file
end
end
class BeStaticServerPages
def initialize
@pages = ['http://localhost:8888/', 'http://localhost:8888/foo']
@actual = nil
end
  attr_accessor :actual
def matches?(actual)
@actual = actual
actual == @pages
end
def failure_message
"expected #{@pages.inspect}, got #{@actual.inspect}"
end
def description
"be the pages returned by the static server (#{@pages.inspect})"
end
end
def with_web_server(svlt)
server = WEBrick::HTTPServer.new(:Port => 8888, :Logger => null_logger,
:AccessLog => [])
server.mount('/', svlt)
Thread.new {server.start}
begin
yield
ensure
server.shutdown
end
end
def with_memcached
system('memcached -d -P /tmp/spider-memcached.pid')
cacher = IncludedInMemcached.new('localhost:11211')
begin
yield
ensure
system('kill -KILL `cat /tmp/spider-memcached.pid`')
end
end
def be_static_server_pages
BeStaticServerPages.new
end
class QueryServlet < WEBrick::HTTPServlet::AbstractServlet
def do_GET(req, res)
res['Content-type'] = 'text/plain'
res.body = "response\n"
end
end
class LoopingServlet < WEBrick::HTTPServlet::AbstractServlet
def do_GET(req, res)
res['Content-type'] = 'text/html'
if req.path == '/foo'
res.body = <<-END
        <a href="/">a</a>
END
else
res.body = <<-END
        <a href="/foo">b</a>
END
end
end
end
def null_logger
l = stub
[:log, :fatal, :error, :warn , :info, :debug].each do |k|
l.stubs(k)
l.stubs("#{k}?".to_sym)
end
l
end
spider-0.5.0/spec/spider/included_in_memcached_spec.rb
require File.dirname(__FILE__)+'/../spec_helper'
def before_specing_memcached
local_require 'spider/included_in_memcached'
system('memcached -d -P /tmp/spider-memcached.pid')
end
def after_specing_memcached
system('kill -KILL `cat /tmp/spider-memcached.pid`')
end
Spec::Runner.configure { |c| c.mock_with :mocha }
describe 'Object to halt cycles' do
before do
before_specing_memcached
end
it 'should understand <<' do
c = IncludedInMemcached.new('localhost:11211')
c.should respond_to(:<<)
end
it 'should understand included?' do
c = IncludedInMemcached.new('localhost:11211')
c.should respond_to(:include?)
end
it 'should produce false if the object is not included' do
c = IncludedInMemcached.new('localhost:11211')
c.include?('a').should be_false
end
it 'should produce true if the object is included' do
c = IncludedInMemcached.new('localhost:11211')
c << 'a'
c.include?('a').should be_true
end
after do
after_specing_memcached
end
end
spider-0.5.0/spec/spider/spider_instance_spec.rb
require File.dirname(__FILE__)+'/../spec_helper'
require 'webrick'
require 'webrick/https'
local_require 'spider', 'spider/included_in_memcached'
describe 'SpiderInstance' do
# http://www.rcuk.ac.uk/ redirects to /default.htm, which isn't a complete
# URL. Bug reported by Henri Cook.
it 'should construct a complete redirect URL' do
@response_called = false
redirected_resp = stub(:redirect? => true,
:[] => '/default.htm')
success_resp = stub(:redirect? => false)
http_req = stub(:request => true)
http_mock_redir = stub(:use_ssl= => true)
http_mock_redir.stubs(:start).yields(http_req).returns(redirected_resp)
http_mock_success = stub(:use_ssl= => true)
http_mock_success.stubs(:start).yields(http_req).returns(success_resp)
Net::HTTP.expects(:new).times(2).returns(http_mock_redir).then.
returns(http_mock_success)
si = SpiderInstance.new({nil => ['http://www.rcuk.ac.uk/']})
si.get_page(URI.parse('http://www.rcuk.ac.uk/')) do |resp|
@response_called = true
end
@response_called.should be_true
end
it 'should prevent cycles with an IncludedInMemcached' do
with_memcached do
cacher = IncludedInMemcached.new('localhost:11211')
it_should_prevent_cycles_with(cacher)
end
end
it 'should prevent cycles with an Array' do
cacher = Array.new
it_should_prevent_cycles_with(cacher)
end
it 'should call the "setup" callback before loading the Web page' do
mock_successful_http
@on_called = false
@before_called = false
si = SpiderInstance.new({nil => ['http://example.com/']})
si.stubs(:allowed?).returns(true)
si.stubs(:generate_next_urls).returns([])
si.setup { |*a| @before_called = Time.now }
si.on(:every) { |*a| @on_called = Time.now }
si.start!
@on_called.should_not be_false
@before_called.should_not be_false
@before_called.should_not be_false
@before_called.should < @on_called
end
it 'should call the "teardown" callback after running all other callbacks' do
mock_successful_http
@on_called = false
@after_called = false
si = SpiderInstance.new({nil => ['http://example.com/']})
si.stubs(:allowed?).returns(true)
si.stubs(:generate_next_urls).returns([])
si.on(:every) { |*a| @on_called = Time.now }
si.teardown { |*a| @after_called = Time.now }
si.start!
@on_called.should_not be_false
@after_called.should_not be_false
@after_called.should_not be_false
@after_called.should > @on_called
end
it 'should pass headers set by a setup handler to the HTTP request' do
mock_successful_http
Net::HTTP::Get.expects(:new).with('/foo',{'X-Header-Set' => 'True'})
si = SpiderInstance.new(nil => ['http://example.com/foo'])
si.stubs(:allowable_url?).returns(true)
si.stubs(:generate_next_urls).returns([])
si.setup do |a_url|
si.headers['X-Header-Set'] = 'True'
end
si.teardown do |a_url|
si.clear_headers
end
si.start!
end
it 'should call the :every callback with the current URL, the response, and the prior URL' do
mock_successful_http
callback_arguments_on(:every)
end
it 'should call the :success callback with the current URL, the request, and the prior URL' do
mock_successful_http
callback_arguments_on(:success)
end
it 'should call the :failure callback with the current URL, the request, and the prior URL' do
mock_failed_http
callback_arguments_on(:failure)
end
it 'should call the HTTP status error code callback with the current URL, the request, and the prior URL' do
mock_failed_http
callback_arguments_on(404)
end
it 'should call the HTTP status success code callback with the current URL, the request, and the prior URL' do
mock_successful_http
callback_arguments_on(200)
end
# Bug reported by John Nagro, using the example source http://eons.com/
# had to change line 192; uses request_uri now instead of path.
it 'should handle query URLs without a path' do
u = 'http://localhost:8888?s=1'
u_p = URI.parse(u)
@block_called = false
with_web_server(QueryServlet) do
si = SpiderInstance.new({nil => [u]})
si.get_page(u_p) do
@block_called = true
end
end
@block_called.should be_true
end
# This solves a problem reported by John Nagro.
it 'should handle redirects' do
u = 'http://example.com/'
u_p = URI.parse(u)
@redirect_handled = false
mock_redirect_http
si = SpiderInstance.new({nil => [u]})
si.get_page(u_p) do
@redirect_handled = true
end
@redirect_handled.should be_true
end
it 'should handle HTTPS' do
u = 'https://localhost:10443/'
u_p = URI.parse(u)
@page_called = false
server = WEBrick::HTTPServer.new(:Port => 10443,
:Logger => null_logger,
:AccessLog => [],
:SSLEnable => true,
:SSLCertName => [["O", "ruby-lang.org"], ["OU", "sample"], ["CN", WEBrick::Utils::getservername]],
:SSLComment => 'Comment of some sort')
server.mount('/', QueryServlet)
Thread.new {server.start}
si = SpiderInstance.new({nil => [u]})
si.get_page(u_p) { @page_called = true }
server.shutdown
@page_called.should be_true
end
it 'should skip URLs when allowable_url? is false' do
u = 'http://example.com/'
u_p = URI.parse(u)
http_resp = stub(:redirect? => false, :success? => true, :code => 200, :headers => 1, :body => 1)
Net::HTTP.stubs(:new).returns(stub(:request => http_resp, :finish => nil))
si = SpiderInstance.new({nil => [u]})
si.expects(:allowable_url?).with(u, u_p).returns(false)
si.expects(:get_page).times(0)
si.start!
end
it 'should not skip URLs when allowable_url? is true' do
u = 'http://example.com/'
u_p = URI.parse(u)
http_resp = stub(:redirect? => false, :success? => true, :code => 200, :headers => 1, :body => 1)
Net::HTTP.stubs(:new).returns(stub(:request => http_resp, :finish => nil))
si = SpiderInstance.new({nil => [u]})
si.expects(:allowable_url?).with(u, u_p).returns(true)
si.expects(:get_page).with(URI.parse(u))
si.start!
end
it 'should disallow URLs when the robots.txt says to' do
robot_rules = stub
SpiderInstance.any_instance.expects(:open).
with('http://example.com:80/robots.txt', 'User-Agent' => 'Ruby Spider',
'Accept' => 'text/html,text/xml,application/xml,text/plain').
yields(stub(:read => 'robots.txt content'))
robot_rules.expects(:parse).with('http://example.com:80/robots.txt',
'robots.txt content')
robot_rules.expects(:allowed?).with('http://example.com/').returns(false)
si = SpiderInstance.new({nil => ['http://example.com/']}, [], robot_rules, [])
allowable = si.allowable_url?('http://example.com/',
URI.parse('http://example.com/'))
allowable.should be_false
end
it 'should disallow URLs when they fail any url_check' do
si = SpiderInstance.new({nil => ['http://example.com/']})
si.stubs(:allowed?).returns(true)
si.add_url_check { |a_url| false }
allowable = si.allowable_url?('http://example.com/',
URI.parse('http://example.com/'))
allowable.should be_false
end
it 'should support multiple url_checks' do
@first_url_check = false
@second_url_check = false
si = SpiderInstance.new({nil => ['http://example.com/']})
si.stubs(:allowed?).returns(true)
si.add_url_check do |a_url|
@first_url_check = true
true
end
si.add_url_check do |a_url|
@second_url_check = true
false
end
allowable = si.allowable_url?('http://example.com/',
URI.parse('http://example.com/'))
allowable.should be_false
@first_url_check.should be_true
@second_url_check.should be_true
end
it 'should avoid cycles' do
u = 'http://example.com/'
u_p = URI.parse(u)
si = SpiderInstance.new({nil => [u]}, [u_p])
si.stubs(:allowed?).returns(true)
allowable = si.allowable_url?(u, u_p)
allowable.should be_false
u_p.should_not be_nil
end
it 'should call the 404 handler for 404s' do
@proc_called = false
mock_failed_http
si = SpiderInstance.new({nil => ['http://example.com/']})
si.stubs(:allowed?).returns(true)
si.stubs(:generate_next_urls).returns([])
si.on(404) {|*a| @proc_called = true}
si.start!
@proc_called.should be_true
end
it 'should call the :success handler on success' do
@proc_called = false
mock_successful_http
si = SpiderInstance.new({nil => ['http://example.com/']})
si.stubs(:allowed?).returns(true)
si.stubs(:generate_next_urls).returns([])
si.on(:success) {|*a| @proc_called = true}
si.start!
@proc_called.should be_true
end
it 'should not call the :success handler on failure' do
@proc_called = false
mock_failed_http
si = SpiderInstance.new({nil => ['http://example.com/']})
si.stubs(:allowed?).returns(true)
si.stubs(:generate_next_urls).returns([])
si.on(:success) {|*a| @proc_called = true}
si.start!
@proc_called.should be_false
end
it 'should call the :success handler and the 200 handler on 200' do
@proc_200_called = false
@proc_success_called = false
mock_successful_http
si = SpiderInstance.new({nil => ['http://example.com/']})
si.stubs(:allowed?).returns(true)
si.stubs(:generate_next_urls).returns([])
si.on(:success) {|*a| @proc_success_called = true}
si.on(200) {|*a| @proc_200_called = true}
si.start!
@proc_200_called.should be_true
@proc_success_called.should be_true
end
it 'should not call the :failure handler on success' do
@proc_called = false
mock_successful_http
si = SpiderInstance.new({nil => ['http://example.com/']})
si.stubs(:allowed?).returns(true)
si.stubs(:generate_next_urls).returns([])
si.on(:failure) {|*a| @proc_called = true}
si.start!
@proc_called.should be_false
end
it 'should call the :failure handler on failure' do
@proc_called = false
mock_failed_http
si = SpiderInstance.new({nil => ['http://example.com/']})
si.stubs(:allowed?).returns(true)
si.stubs(:generate_next_urls).returns([])
si.on(:failure) {|*a| @proc_called = true}
si.start!
@proc_called.should be_true
end
it 'should call the :failure handler and the 404 handler on 404' do
@proc_404_called = false
@proc_failure_called = false
mock_failed_http
si = SpiderInstance.new({nil => ['http://example.com/']})
si.stubs(:allowed?).returns(true)
si.stubs(:generate_next_urls).returns([])
si.on(:failure) {|*a| @proc_failure_called = true}
si.on(404) {|*a| @proc_404_called = true}
si.start!
@proc_404_called.should be_true
@proc_failure_called.should be_true
end
it 'should call the :every handler even when a handler for the error code is defined' do
@any_called = false
mock_successful_http
si = SpiderInstance.new({nil => ['http://example.com/']})
si.stubs(:allowed?).returns(true)
si.stubs(:generate_next_urls).returns([])
si.on(:every) { |*a| @any_called = true }
si.on(202) {|*a|}
si.start!
@any_called.should be_true
end
it 'should support a block as a response handler' do
@proc_called = false
mock_successful_http
si = SpiderInstance.new({nil => ['http://example.com/']})
si.stubs(:allowed?).returns(true)
si.stubs(:generate_next_urls).returns([])
si.on(:every) { |*a| @proc_called = true }
si.start!
@proc_called.should be_true
end
it 'should support a proc as a response handler' do
@proc_called = false
mock_successful_http
si = SpiderInstance.new({nil => ['http://example.com/']})
si.stubs(:allowed?).returns(true)
si.stubs(:generate_next_urls).returns([])
si.on(:every, Proc.new { |*a| @proc_called = true })
si.start!
@proc_called.should be_true
end
def mock_http(http_req)
http_obj = mock(:use_ssl= => true)
http_obj.expects(:start).
yields(mock(:request => http_req)).returns(http_req)
Net::HTTP.expects(:new).returns(http_obj)
end
def mock_successful_http
http_req = stub(:redirect? => false, :success? => true, :code => 200, :body => 'body')
mock_http(http_req)
end
def mock_failed_http
http_req = stub(:redirect? => false, :success? => false, :code => 404)
mock_http(http_req)
end
def mock_redirect_http
http_req = stub(:redirect? => true, :success? => false, :code => 404)
http_req.expects(:[]).with('Location').returns('http://example.com/')
http_req2 = stub(:redirect? => false, :success? => true, :code => 200)
http_obj = mock(:use_ssl= => true)
http_obj.expects(:start).
yields(mock(:request => http_req)).returns(http_req)
http_obj2 = mock(:use_ssl= => true)
http_obj2.expects(:start).
yields(mock(:request => http_req2)).returns(http_req2)
Net::HTTP.expects(:new).times(2).returns(http_obj).then.returns(http_obj2)
end
def callback_arguments_on(code)
si = SpiderInstance.new('http://foo.com/' => ['http://example.com/'])
si.stubs(:allowed?).returns(true)
si.stubs(:generate_next_urls).returns([])
si.on(code) do |a_url, resp, prior_url|
a_url.should == 'http://example.com/'
resp.should_not be_nil
prior_url.should == 'http://foo.com/'
end
si.start!
end
def it_should_prevent_cycles_with(cacher)
u = 'http://localhost:8888/'
u_p = URI.parse(u)
u2 = 'http://localhost:8888/foo'
u_p2 = URI.parse(u2)
with_web_server(LoopingServlet) do
si = SpiderInstance.new(nil => [u])
si.check_already_seen_with cacher
si.start!
end
end
end
spider-0.5.0/spec/spider_spec.rb
require File.dirname(__FILE__)+'/spec_helper'
local_require 'spider', 'spider/included_in_memcached'
describe 'Spider' do
it 'should find two pages without cycles using defaults' do
u = []
with_web_server(LoopingServlet) do
u = find_pages_with_static_server
end
u.should be_static_server_pages
end
it 'should find two pages without cycles using memcached' do
u = []
with_web_server(LoopingServlet) do
with_memcached do
u = find_pages_with_static_server do |s|
s.check_already_seen_with IncludedInMemcached.new('localhost:11211')
end
end
end
u.should be_static_server_pages
end
def find_pages_with_static_server(&block)
pages = []
Spider.start_at('http://localhost:8888/') do |s|
block.call(s) unless block.nil?
s.on(:every){ |u,r,p| pages << u }
end
pages
end
end
spider-0.5.0/spider.gemspec
require 'rubygems'
require File.expand_path('../lib/spider', __FILE__)
spec = Gem::Specification.new do |s|
s.author = 'John Nagro'
s.email = 'john.nagro@gmail.com'
s.license = 'MIT'
s.has_rdoc = true
s.homepage = 'https://github.com/johnnagro/spider'
s.name = 'spider'
s.rubyforge_project = 'spider'
s.summary = 'A Web spidering library'
s.files = Dir['**/*'].delete_if { |f| f =~ /(cvs|gem|svn)$/i }
s.require_path = 'lib'
s.description = <<-EOF
A Web spidering library: handles robots.txt, scraping, finding more
links, and doing it all over again.
EOF
s.version = Spider::VERSION
end