ruby-spider-0.4.4/CHANGES

2009-05-21
* fixed an issue with robots.txt on ssl hosts
* fixed an issue with pulling robots.txt from disallowed hosts
* fixed a documentation error with ExpiredLinks
* Many thanks to Brian Campbell

2008-10-09
* fixed a situation with nested slashes in urls, thanks to Sander van der Vliet and John Buckley

2008-07-06
* Trap interrupts and shutdown gracefully
* Support for custom urls-to-crawl objects
* Example AmazonSQS urls-to-crawl support (next_urls_in_sqs.rb)

2007-11-09:
* Handle redirects that assume a base URL.

2007-11-08:
* Move spider_instance.rb, robot_rules.rb, and included_in_memcached.rb into spider subdirectory.

2007-11-02:
* Memcached support.

2007-10-31:
* Add `setup' and `teardown' handlers.
* Can set the headers for a HTTP request.
* Changed :any to :every .
* Changed the arguments to the :every, :success, :failure, and code handler.

2007-10-23:
* URLs without a page component but with a query component.
* HTTP Redirect.
* HTTPS.
* Version 0.2.1 .

2007-10-22:
* Use RSpec to ensure that it mostly works.
* Use WEBrick to create a small test server for additional testing.
* Completely re-do the API to prepare for future expansion.
* Add the ability to apply each URL to a series of custom allowed?-like matchers.
* BSD license.
* Version 0.2.0 .

2007-03-30:
* Clean up the documentation.

2007-03-28:
* Change the tail recursion to a `while' loop, to please Ruby.
* Documentation.
* Initial release: version 0.1.0 .

ruby-spider-0.4.4/doc/fr_method_index.html

Methods

<< (IncludedInMemcached)
add_url_check (SpiderInstance)
after_specing_memcached (spec/spider/included_in_memcached_spec.rb)
allowed? (RobotRules)
be_static_server_pages (spec/spec_helper.rb)
before_specing_memcached (spec/spider/included_in_memcached_spec.rb)
callback_arguments_on (spec/spider/spider_instance_spec.rb)
check_already_seen_with (SpiderInstance)
clear_headers (SpiderInstance)
description (BeStaticServerPages)
do_GET (QueryServlet)
do_GET (LoopingServlet)
failure_message (BeStaticServerPages)
find_pages_with_static_server (spec/spider_spec.rb)
headers (SpiderInstance)
include? (IncludedInMemcached)
it_should_prevent_cycles_with (spec/spider/spider_instance_spec.rb)
local_require (spec/spec_helper.rb)
matches? (BeStaticServerPages)
mock_failed_http (spec/spider/spider_instance_spec.rb)
mock_http (spec/spider/spider_instance_spec.rb)
mock_redirect_http (spec/spider/spider_instance_spec.rb)
mock_successful_http (spec/spider/spider_instance_spec.rb)
new (IncludedInMemcached)
new (NextUrlsInSQS)
new (BeStaticServerPages)
new (RobotRules)
null_logger (spec/spec_helper.rb)
on (SpiderInstance)
parse (RobotRules)
pop (NextUrlsInSQS)
push (NextUrlsInSQS)
setup (SpiderInstance)
start_at (Spider)
store_next_urls_with (SpiderInstance)
teardown (SpiderInstance)
with_memcached (spec/spec_helper.rb)
with_web_server (spec/spec_helper.rb)
ruby-spider-0.4.4/doc/files/spec/spider/included_in_memcached_spec_rb.html

File: included_in_memcached_spec.rb

included_in_memcached_spec.rb

Path: spec/spider/included_in_memcached_spec.rb
Last Update: Thu May 21 13:19:06 +0000 2009

Methods

Public Instance methods


ruby-spider-0.4.4/doc/files/spec/spider/included_in_memcached_spec_rb.src/M000006.html

before_specing_memcached (spec/spider/included_in_memcached_spec.rb)
# File spec/spider/included_in_memcached_spec.rb, line 3
def before_specing_memcached
  local_require 'spider/included_in_memcached'
  system('memcached -d -P /tmp/spider-memcached.pid')
end
ruby-spider-0.4.4/doc/files/spec/spider/included_in_memcached_spec_rb.src/M000007.html

after_specing_memcached (spec/spider/included_in_memcached_spec.rb)
# File spec/spider/included_in_memcached_spec.rb, line 8
def after_specing_memcached
  system('kill -KILL `cat /tmp/spider-memcached.pid`')
end
ruby-spider-0.4.4/doc/files/spec/spider/spider_instance_spec_rb.src/M000010.html

mock_failed_http (spec/spider/spider_instance_spec.rb)
# File spec/spider/spider_instance_spec.rb, line 363
  def mock_failed_http
    http_req = stub(:redirect? => false, :success? => false, :code => 404)
    mock_http(http_req)
  end
ruby-spider-0.4.4/doc/files/spec/spider/spider_instance_spec_rb.src/M000012.html

callback_arguments_on (spec/spider/spider_instance_spec.rb)
# File spec/spider/spider_instance_spec.rb, line 381
  def callback_arguments_on(code)
    si = SpiderInstance.new('http://foo.com/' => ['http://example.com/'])
    si.stubs(:allowed?).returns(true)
    si.stubs(:generate_next_urls).returns([])
    si.on(code) do |a_url, resp, prior_url|
      a_url.should == 'http://example.com/'
      resp.should_not be_nil
      prior_url.should == 'http://foo.com/'
    end
    si.start!
  end
ruby-spider-0.4.4/doc/files/spec/spider/spider_instance_spec_rb.src/M000011.html

mock_redirect_http (spec/spider/spider_instance_spec.rb)
# File spec/spider/spider_instance_spec.rb, line 368
  def mock_redirect_http
    http_req = stub(:redirect? => true, :success? => false, :code => 404)
    http_req.expects(:[]).with('Location').returns('http://example.com/')
    http_req2 = stub(:redirect? => false, :success? => true, :code => 200)
    http_obj = mock(:use_ssl= => true)
    http_obj.expects(:start).
      yields(mock(:request => http_req)).returns(http_req)
    http_obj2 = mock(:use_ssl= => true)
    http_obj2.expects(:start).
      yields(mock(:request => http_req2)).returns(http_req2)
    Net::HTTP.expects(:new).times(2).returns(http_obj).then.returns(http_obj2)
  end
ruby-spider-0.4.4/doc/files/spec/spider/spider_instance_spec_rb.src/M000008.html

mock_http (spec/spider/spider_instance_spec.rb)
# File spec/spider/spider_instance_spec.rb, line 351
  def mock_http(http_req)
    http_obj = mock(:use_ssl= => true)
    http_obj.expects(:start).
      yields(mock(:request => http_req)).returns(http_req)
    Net::HTTP.expects(:new).returns(http_obj)
  end
ruby-spider-0.4.4/doc/files/spec/spider/spider_instance_spec_rb.src/M000009.html

mock_successful_http (spec/spider/spider_instance_spec.rb)
# File spec/spider/spider_instance_spec.rb, line 358
  def mock_successful_http
    http_req = stub(:redirect? => false, :success? => true, :code => 200, :body => 'body')
    mock_http(http_req)
  end
ruby-spider-0.4.4/doc/files/spec/spider/spider_instance_spec_rb.src/M000013.html

it_should_prevent_cycles_with (spec/spider/spider_instance_spec.rb)
# File spec/spider/spider_instance_spec.rb, line 393
  def it_should_prevent_cycles_with(cacher)
    u = 'http://localhost:8888/'
    u_p = URI.parse(u)
    u2 = 'http://localhost:8888/foo'
    u_p2 = URI.parse(u2)

    with_web_server(LoopingServlet) do
      si = SpiderInstance.new(nil => [u])
      si.check_already_seen_with cacher
      si.start!
    end
  end
ruby-spider-0.4.4/doc/files/spec/spider/spider_instance_spec_rb.html

File: spider_instance_spec.rb

spider_instance_spec.rb

Path: spec/spider/spider_instance_spec.rb
Last Update: Thu May 21 13:19:06 +0000 2009

Required files

webrick   webrick/https  

Methods

Public Instance methods


ruby-spider-0.4.4/doc/files/spec/spec_helper_rb.html

File: spec_helper.rb

spec_helper.rb

Path: spec/spec_helper.rb
Last Update: Thu May 21 13:19:06 +0000 2009

Required files

rubygems   webrick   spec  

Methods

Public Instance methods


ruby-spider-0.4.4/doc/files/spec/spider_spec_rb.html

File: spider_spec.rb

spider_spec.rb

Path: spec/spider_spec.rb
Last Update: Thu May 21 13:19:06 +0000 2009

Methods

Public Instance methods


ruby-spider-0.4.4/doc/files/spec/spider_spec_rb.src/M000014.html

find_pages_with_static_server (spec/spider_spec.rb)
# File spec/spider_spec.rb, line 25
  def find_pages_with_static_server(&block)
    pages = []
    Spider.start_at('http://localhost:8888/') do |s|
      block.call(s) unless block.nil?
      s.on(:every){ |u,r,p| pages << u }
    end
    pages
  end
ruby-spider-0.4.4/doc/files/spec/spec_helper_rb.src/M000002.html

with_web_server (spec/spec_helper.rb)
# File spec/spec_helper.rb, line 35
def with_web_server(svlt)
  server = WEBrick::HTTPServer.new(:Port => 8888, :Logger => null_logger,
                                   :AccessLog => [])
  server.mount('/', svlt)
  Thread.new {server.start}
  begin
    yield
  ensure
    server.shutdown
  end
end
ruby-spider-0.4.4/doc/files/spec/spec_helper_rb.src/M000004.html

be_static_server_pages (spec/spec_helper.rb)
# File spec/spec_helper.rb, line 57
def be_static_server_pages
  BeStaticServerPages.new
end
ruby-spider-0.4.4/doc/files/spec/spec_helper_rb.src/M000003.html

with_memcached (spec/spec_helper.rb)
# File spec/spec_helper.rb, line 47
def with_memcached
  system('memcached -d -P /tmp/spider-memcached.pid')
  cacher = IncludedInMemcached.new('localhost:11211')
  begin
    yield
  ensure
    system('kill -KILL `cat /tmp/spider-memcached.pid`')
  end
end
ruby-spider-0.4.4/doc/files/spec/spec_helper_rb.src/M000001.html

local_require (spec/spec_helper.rb)
# File spec/spec_helper.rb, line 7
def local_require(*files)
  files.each do |file|
    require File.dirname(__FILE__)+'/../lib/'+file
  end
end
ruby-spider-0.4.4/doc/files/spec/spec_helper_rb.src/M000005.html

null_logger (spec/spec_helper.rb)
# File spec/spec_helper.rb, line 85
def null_logger
  l = stub
  [:log, :fatal, :error, :warn , :info, :debug].each do |k|
    l.stubs(k)
    l.stubs("#{k}?".to_sym)
  end
  l
end
ruby-spider-0.4.4/doc/files/lib/spider/robot_rules_rb.html

File: robot_rules.rb

robot_rules.rb

Path: lib/spider/robot_rules.rb
Last Update: Thu May 21 13:19:06 +0000 2009

Understand robots.txt.

Required files

uri  


ruby-spider-0.4.4/doc/files/lib/spider/spider_instance_rb.html

File: spider_instance.rb

spider_instance.rb

Path: lib/spider/spider_instance.rb
Last Update: Thu May 21 15:38:44 +0000 2009

Specialized spidering rules.

Required files

open-uri   uri   net/http   net/https  


ruby-spider-0.4.4/doc/files/lib/spider/next_urls_in_sqs_rb.html

File: next_urls_in_sqs.rb

next_urls_in_sqs.rb

Path: lib/spider/next_urls_in_sqs.rb
Last Update: Thu May 21 13:19:06 +0000 2009

Use AmazonSQS to track nodes to visit.

Copyright 2008 John Nagro

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

     * Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.
     * Redistributions in binary form must reproduce the above copyright
     notice, this list of conditions and the following disclaimer in the
     documentation and/or other materials provided with the distribution.
     * Neither the name Mike Burns nor the
     names of his contributors may be used to endorse or promote products
     derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS’’ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

Required files

rubygems   right_aws   yaml  


ruby-spider-0.4.4/doc/files/lib/spider/included_in_memcached_rb.html

File: included_in_memcached.rb

included_in_memcached.rb

Path: lib/spider/included_in_memcached.rb
Last Update: Thu May 21 13:19:06 +0000 2009

Use memcached to track cycles.

Copyright 2007 Mike Burns

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

     * Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.
     * Redistributions in binary form must reproduce the above copyright
     notice, this list of conditions and the following disclaimer in the
     documentation and/or other materials provided with the distribution.
     * Neither the name Mike Burns nor the
     names of his contributors may be used to endorse or promote products
     derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS’’ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

Required files

memcache  


ruby-spider-0.4.4/doc/files/lib/spider_rb.html

File: spider.rb

spider.rb

Path: lib/spider.rb
Last Update: Thu May 21 13:19:06 +0000 2009

Copyright 2007-2008 Mike Burns & John Nagro

Spider, a Web spidering library for Ruby. It handles the robots.txt, scraping, collecting, and looping so that you can just handle the data.

Examples

Crawl the Web, loading each page in turn, until you run out of memory

 require 'spider'
 Spider.start_at('http://mike-burns.com/') {}

To handle erroneous responses

 require 'spider'
 Spider.start_at('http://mike-burns.com/') do |s|
   s.on :failure do |a_url, resp, prior_url|
     puts "URL failed: #{a_url}"
     puts " linked from #{prior_url}"
   end
 end

Or handle successful responses

 require 'spider'
 Spider.start_at('http://mike-burns.com/') do |s|
   s.on :success do |a_url, resp, prior_url|
     puts "#{a_url}: #{resp.code}"
     puts resp.body
     puts
   end
 end

Limit to just one domain

 require 'spider'
 Spider.start_at('http://mike-burns.com/') do |s|
   s.add_url_check do |a_url|
     a_url =~ %r{^http://mike-burns.com.*}
   end
 end

Pass headers to some requests

 require 'spider'
 Spider.start_at('http://mike-burns.com/') do |s|
   s.setup do |a_url|
     if a_url =~ %r{^http://.*wikipedia.*}
       headers['User-Agent'] = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
     end
   end
 end

Use memcached to track cycles

 require 'spider'
 require 'spider/included_in_memcached'
 SERVERS = ['10.0.10.2:11211','10.0.10.3:11211','10.0.10.4:11211']
 Spider.start_at('http://mike-burns.com/') do |s|
   s.check_already_seen_with IncludedInMemcached.new(SERVERS)
 end

Track cycles with a custom object

 require 'spider'
 class ExpireLinks < Hash
   def <<(v)
     self[v] = Time.now
   end
   def include?(v)
     self[v].kind_of?(Time) && (self[v] + 86400) >= Time.now
   end
 end

 Spider.start_at('http://mike-burns.com/') do |s|
   s.check_already_seen_with ExpireLinks.new
 end

Store nodes to visit with Amazon SQS

 require 'spider'
 require 'spider/next_urls_in_sqs'
 Spider.start_at('http://mike-burns.com') do |s|
   s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY)
 end

Store nodes to visit with a custom object

 require 'spider'
 class MyArray < Array
   def pop
      super
   end

   def push(a_msg)
     super(a_msg)
   end
 end

 Spider.start_at('http://mike-burns.com') do |s|
   s.store_next_urls_with MyArray.new
 end

Create a URL graph

 require 'spider'
 nodes = {}
 Spider.start_at('http://mike-burns.com/') do |s|
   s.add_url_check {|a_url| a_url =~ %r{^http://mike-burns.com.*} }

   s.on(:every) do |a_url, resp, prior_url|
     nodes[prior_url] ||= []
     nodes[prior_url] << a_url
   end
 end

Use a proxy

 require 'net/http_configuration'
 require 'spider'
 http_conf = Net::HTTP::Configuration.new(:proxy_host => '7proxies.org',
                                          :proxy_port => 8881)
 http_conf.apply do
   Spider.start_at('http://img.4chan.org/b/') do |s|
     s.on(:success) do |a_url, resp, prior_url|
       File.open(a_url.gsub('/',':'),'w') do |f|
         f.write(resp.body)
       end
     end
   end
 end

Author

John Nagro john.nagro@gmail.com

Mike Burns mike-burns.com mike@mike-burns.com (original author)

Many thanks to: Matt Horan Henri Cook Sander van der Vliet John Buckley Brian Campbell

With `robot_rules’ from James Edward Gray II via blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589


ruby-spider-0.4.4/doc/files/README.html

File: README

README

Path: README
Last Update: Thu Nov 08 17:51:17 -0500 2007

Spider, a Web spidering library for Ruby. It handles the robots.txt, scraping, collecting, and looping so that you can just handle the data.

Examples

Crawl the Web, loading each page in turn, until you run out of memory

 require 'spider'
 Spider.start_at('http://mike-burns.com/') {}

To handle erroneous responses

 require 'spider'
 Spider.start_at('http://mike-burns.com/') do |s|
   s.on :failure do |a_url, resp, prior_url|
     puts "URL failed: #{a_url}"
     puts " linked from #{prior_url}"
   end
 end

Or handle successful responses

 require 'spider'
 Spider.start_at('http://mike-burns.com/') do |s|
   s.on :success do |a_url, resp, prior_url|
     puts "#{a_url}: #{resp.code}"
     puts resp.body
     puts
   end
 end

Limit to just one domain

 require 'spider'
 Spider.start_at('http://mike-burns.com/') do |s|
   s.add_url_check do |a_url|
     a_url =~ %r{^http://mike-burns.com.*}
   end
 end

Pass headers to some requests

 require 'spider'
 Spider.start_at('http://mike-burns.com/') do |s|
   s.setup do |a_url|
     if a_url =~ %r{^http://.*wikipedia.*}
       headers['User-Agent'] = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
     end
   end
 end

Use memcached to track cycles

 require 'spider'
 require 'spider/included_in_memcached'
 SERVERS = ['10.0.10.2:11211','10.0.10.3:11211','10.0.10.4:11211']
 Spider.start_at('http://mike-burns.com/') do |s|
   s.check_already_seen_with IncludedInMemcached.new(SERVERS)
 end

Track cycles with a custom object

 require 'spider'

 class ExpireLinks < Hash
   def <<(v)
     self[v] = Time.now
   end
   def include?(v)
     self[v].kind_of?(Time) && (self[v] + 86400) >= Time.now
   end
 end

 Spider.start_at('http://mike-burns.com/') do |s|
   s.check_already_seen_with ExpireLinks.new
 end

Create a URL graph

 require 'spider'
 nodes = {}
 Spider.start_at('http://mike-burns.com/') do |s|
   s.add_url_check {|a_url| a_url =~ %r{^http://mike-burns.com.*} }

   s.on(:every) do |a_url, resp, prior_url|
     nodes[prior_url] ||= []
     nodes[prior_url] << a_url
   end
 end

Use a proxy

 require 'net/http_configuration'
 require 'spider'
 http_conf = Net::HTTP::Configuration.new(:proxy_host => '7proxies.org',
                                          :proxy_port => 8881)
 http_conf.apply do
   Spider.start_at('http://img.4chan.org/b/') do |s|
     s.on(:success) do |a_url, resp, prior_url|
       File.open(a_url.gsub('/',':'),'w') do |f|
         f.write(resp.body)
       end
     end
   end
 end

Author

Mike Burns mike-burns.com mike@mike-burns.com

Help from Matt Horan, John Nagro, and Henri Cook.

With `robot_rules’ from James Edward Gray II via blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589


ruby-spider-0.4.4/doc/classes/BeStaticServerPages.src/M000033.html

description (BeStaticServerPages)
# File spec/spec_helper.rb, line 30
  def description
    "be the pages returned by the static server (#{@pages.inspect})"
  end
ruby-spider-0.4.4/doc/classes/BeStaticServerPages.src/M000031.html

matches? (BeStaticServerPages)
# File spec/spec_helper.rb, line 21
  def matches?(actual)
    @actual = actual
    actual == @pages
  end
ruby-spider-0.4.4/doc/classes/BeStaticServerPages.src/M000030.html

new (BeStaticServerPages)
# File spec/spec_helper.rb, line 14
  def initialize
    @pages = ['http://localhost:8888/', 'http://localhost:8888/foo']
    @actual = nil
  end
ruby-spider-0.4.4/doc/classes/BeStaticServerPages.src/M000032.html

failure_message (BeStaticServerPages)
# File spec/spec_helper.rb, line 26
  def failure_message
    "expected #{@pages.inspect}, got #{@actual.inspect}"
  end
ruby-spider-0.4.4/doc/classes/Spider.html

Class: Spider
Class Spider
In: lib/spider.rb
Parent: Object

A spidering library for Ruby. Handles robots.txt, scraping, finding more links, and doing it all over again.

Methods

start_at  

Public Class methods

Runs the spider starting at the given URL. Also takes a block that is given the SpiderInstance. Use the block to define the rules and handlers for the discovered Web pages. See SpiderInstance for the possible rules and handlers.

 Spider.start_at('http://mike-burns.com/') do |s|
   s.add_url_check do |a_url|
     a_url =~ %r{^http://mike-burns.com.*}
   end

   s.on 404 do |a_url, resp, prior_url|
     puts "URL not found: #{a_url}"
   end

   s.on :success do |a_url, resp, prior_url|
     puts "body: #{resp.body}"
   end

   s.on :every do |a_url, resp, prior_url|
     puts "URL returned anything: #{a_url} with this code #{resp.code}"
   end
 end


ruby-spider-0.4.4/doc/classes/RobotRules.html

Class: RobotRules
Class RobotRules
In: lib/spider/robot_rules.rb
Parent: Object

Based on Perl‘s WWW::RobotRules module, by Gisle Aas.

Methods

allowed?   new   parse  

Public Class methods

Public Instance methods
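A minimal usage sketch (not part of the original docs; it assumes the gem is on the load path and that the robots.txt body has already been fetched):

 require 'spider/robot_rules'

 # "Ruby Spider 1.0" is reduced internally to the agent token "ruby".
 rules = RobotRules.new('Ruby Spider 1.0')
 rules.parse('http://example.com/robots.txt',
             "User-Agent: *\nDisallow: /private")

 rules.allowed?('http://example.com/')         # => true
 rules.allowed?('http://example.com/private')  # => false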


ruby-spider-0.4.4/doc/classes/IncludedInMemcached.html

Class: IncludedInMemcached
Class IncludedInMemcached
In: lib/spider/included_in_memcached.rb
Parent: Object

A specialized class using memcached to track items stored. It supports three operations: new, <<, and include? . Together these can be used to add items to the memcache, then determine whether the item has been added.

To use it with Spider use the check_already_seen_with method:

 Spider.start_at('http://example.com/') do |s|
   s.check_already_seen_with IncludedInMemcached.new('localhost:11211')
 end

Methods

<<   include?   new  

Public Class methods

Construct a new IncludedInMemcached instance. All arguments here are passed to MemCache (part of the memcache-client gem).

Public Instance methods

Add an item to the memcache.

True if the item is in the memcache.
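
The same object can be exercised outside of Spider, as the specs do; a sketch that assumes a memcached daemon is listening on localhost:11211:

 require 'spider/included_in_memcached'

 seen = IncludedInMemcached.new('localhost:11211')
 seen << 'http://example.com/'
 seen.include?('http://example.com/')      # => true
 seen.include?('http://example.com/other') # => false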


ruby-spider-0.4.4/doc/classes/IncludedInMemcached.src/M000016.html

<< (IncludedInMemcached)
# File lib/spider/included_in_memcached.rb, line 45
  def <<(v)
    @c.add(v.to_s, v)
  end
ruby-spider-0.4.4/doc/classes/IncludedInMemcached.src/M000017.html

include? (IncludedInMemcached)
# File lib/spider/included_in_memcached.rb, line 50
  def include?(v)
    @c.get(v.to_s) == v
  end
ruby-spider-0.4.4/doc/classes/IncludedInMemcached.src/M000015.html

new (IncludedInMemcached)
# File lib/spider/included_in_memcached.rb, line 40
  def initialize(*a)
    @c = MemCache.new(*a)
  end
ruby-spider-0.4.4/doc/classes/NextUrlsInSQS.html

Class: NextUrlsInSQS
Class NextUrlsInSQS
In: lib/spider/next_urls_in_sqs.rb
Parent: Object

A specialized class using AmazonSQS to track nodes to walk. It supports two operations: push and pop . Together these can be used to add items to the queue, then pull items off the queue.

This is useful if you want multiple Spider processes crawling the same data set.

To use it with Spider use the store_next_urls_with method:

 Spider.start_at('http://example.com/') do |s|
   s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY, queue_name)
 end

Methods

new   pop   push  

Public Class methods

Construct a new NextUrlsInSQS instance. All arguments here are passed to RightAWS::SqsGen2 (part of the right_aws gem) or used to set the AmazonSQS queue name (optional).

Public Instance methods

Pull an item off the queue, loop until data is found. Data is encoded with YAML.

Put data on the queue. Data is encoded with YAML.
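
A standalone sketch of the queue outside of Spider; AWS_ACCESS_KEY and AWS_SECRET_ACCESS_KEY stand in for real credentials, and the right_aws gem must be available:

 require 'spider/next_urls_in_sqs'

 queue = NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY, 'ruby-spider')
 queue.push(nil => 'http://example.com/')
 a_url_hash = queue.pop    # polls every 5 seconds until a message arrives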


ruby-spider-0.4.4/doc/classes/QueryServlet.html

Class: QueryServlet
Class QueryServlet
In: spec/spec_helper.rb
Parent: WEBrick::HTTPServlet::AbstractServlet

Methods

do_GET  

Public Instance methods


ruby-spider-0.4.4/doc/classes/NextUrlsInSQS.src/M000020.html

push (NextUrlsInSQS)
# File lib/spider/next_urls_in_sqs.rb, line 62
  def push(a_msg)
    encoded_message = YAML::dump(a_msg)
    @queue.push(encoded_message)
  end
ruby-spider-0.4.4/doc/classes/NextUrlsInSQS.src/M000019.html

pop (NextUrlsInSQS)
# File lib/spider/next_urls_in_sqs.rb, line 53
  def pop
    while true
      message = @queue.pop
      return YAML::load(message.to_s) unless message.nil?
      sleep 5
    end
  end
ruby-spider-0.4.4/doc/classes/NextUrlsInSQS.src/M000018.html

new (NextUrlsInSQS)
# File lib/spider/next_urls_in_sqs.rb, line 46
  def initialize(aws_access_key, aws_secret_access_key, queue_name = 'ruby-spider')
    @sqs = RightAws::SqsGen2.new(aws_access_key, aws_secret_access_key)
    @queue = @sqs.queue(queue_name)
  end
ruby-spider-0.4.4/doc/classes/SpiderInstance.src/M000028.html

clear_headers (SpiderInstance)
# File lib/spider/spider_instance.rb, line 182
  def clear_headers
    @headers = {}
  end
ruby-spider-0.4.4/doc/classes/SpiderInstance.src/M000022.html

check_already_seen_with (SpiderInstance)
# File lib/spider/spider_instance.rb, line 91
  def check_already_seen_with(cacher)
    if cacher.respond_to?(:<<) && cacher.respond_to?(:include?)
      @seen = cacher
    else
      raise ArgumentError, 'expected something that responds to << and include?'
    end
  end
ruby-spider-0.4.4/doc/classes/SpiderInstance.src/M000025.html

setup (SpiderInstance)
# File lib/spider/spider_instance.rb, line 159
  def setup(p = nil, &block)
    @setup = p ? p : block
  end
ruby-spider-0.4.4/doc/classes/SpiderInstance.src/M000027.html

headers (SpiderInstance)
# File lib/spider/spider_instance.rb, line 170
  def headers
    HeaderSetter.new(self)
  end
ruby-spider-0.4.4/doc/classes/SpiderInstance.src/M000026.html

teardown (SpiderInstance)
# File lib/spider/spider_instance.rb, line 164
  def teardown(p = nil, &block)
    @teardown = p ? p : block
  end
ruby-spider-0.4.4/doc/classes/SpiderInstance.src/M000023.html

store_next_urls_with (SpiderInstance)
# File lib/spider/spider_instance.rb, line 115
  def store_next_urls_with(a_store)
    tmp_next_urls = @next_urls
    @next_urls = a_store
    tmp_next_urls.each do |a_url_hash|
      @next_urls.push a_url_hash
    end
  end
ruby-spider-0.4.4/doc/classes/SpiderInstance.src/M000021.html

add_url_check (SpiderInstance)
# File lib/spider/spider_instance.rb, line 70
  def add_url_check(&block)
    @url_checks << block
  end
ruby-spider-0.4.4/doc/classes/SpiderInstance.src/M000024.html

on (SpiderInstance)
# File lib/spider/spider_instance.rb, line 145
  def on(code, p = nil, &block)
    f = p ? p : block
    case code
    when Fixnum
      @callbacks[code] = f
    else
      @callbacks[code.to_sym] = f
    end
  end
ruby-spider-0.4.4/doc/classes/BeStaticServerPages.html

Class: BeStaticServerPages
Class BeStaticServerPages
In: spec/spec_helper.rb
Parent: Object

Methods

Attributes

actual  [RW] 

Public Class methods

Public Instance methods


ruby-spider-0.4.4/doc/classes/RobotRules.src/M000035.html

parse (RobotRules)
# File lib/spider/robot_rules.rb, line 15
  def parse( text_uri, robots_data )
    uri      = URI.parse(text_uri)
    location = "#{uri.host}:#{uri.port}"
    @rules.delete(location)

    rules      = robots_data.split(/[\015\012]+/).map do |rule|
      rule.sub(/\s*#.*$/, "")
    end
    anon_rules = Array.new
    my_rules   = Array.new
    current    = anon_rules
    rules.each do |rule|
      case rule
      when /^\s*User-Agent\s*:\s*(.+?)\s*$/i
        break unless my_rules.empty?

        current = if $1 == "*"
                    anon_rules
                  elsif $1.downcase.index(@user_agent)
                    my_rules
                  else
                    nil
                  end
      when /^\s*Disallow\s*:\s*(.*?)\s*$/i
        next if current.nil?

        if $1.empty?
          current << nil
        else
          disallow = URI.parse($1)

          next unless disallow.scheme.nil? or disallow.scheme ==  
            uri.scheme
          next unless disallow.port.nil?   or disallow.port == uri.port
          next unless disallow.host.nil?   or
          disallow.host.downcase == uri.host.downcase

          disallow = disallow.path
          disallow = "/"            if disallow.empty?
          disallow = "/#{disallow}" unless disallow[0] == ?/

          current << disallow
        end
      end
    end

    @rules[location] = if my_rules.empty?
                         anon_rules.compact
                       else
                         my_rules.compact
                       end
  end
ruby-spider-0.4.4/doc/classes/RobotRules.src/M000036.html

allowed? (RobotRules)
# File lib/spider/robot_rules.rb, line 68
  def allowed?( text_uri )
    uri      = URI.parse(text_uri)
    location = "#{uri.host}:#{uri.port}"
    path     = uri.path

    return true unless %w{http https}.include?(uri.scheme)

    not @rules[location].any? { |rule| path.index(rule) == 0 }
  end
ruby-spider-0.4.4/doc/classes/RobotRules.src/M000034.html

new (RobotRules)
# File lib/spider/robot_rules.rb, line 10
  def initialize( user_agent )
    @user_agent = user_agent.scan(/\S+/).first.sub(%r{/.*}, "").downcase
    @rules      = Hash.new { |rules, rule| rules[rule] = Array.new }
  end
ruby-spider-0.4.4/doc/classes/LoopingServlet.html

Class: LoopingServlet
Class LoopingServlet
In: spec/spec_helper.rb
Parent: WEBrick::HTTPServlet::AbstractServlet

Methods

do_GET  

Public Instance methods


ruby-spider-0.4.4/doc/classes/LoopingServlet.src/M000037.html

do_GET (LoopingServlet)
# File spec/spec_helper.rb, line 69
  def do_GET(req, res)
    res['Content-type'] = 'text/html'
    if req.path == '/foo'
      res.body = "<a href=\"/\">a</a>\n"
    else
      res.body = "<a href=\"/foo\">b</a>\n"
    end
  end
ruby-spider-0.4.4/doc/classes/SpiderInstance.html

Class: SpiderInstance
Class SpiderInstance
In: lib/spider/spider_instance.rb
Parent: Object

Methods

Public Instance methods

Add a predicate that determines whether to continue down this URL‘s path. All predicates must be true in order for a URL to proceed.

Takes a block that takes a string and produces a boolean. For example, this will ensure that the URL starts with ‘mike-burns.com’:

 add_url_check { |a_url| a_url =~ %r{^http://mike-burns.com.*} }

The Web is a graph; to avoid cycles we store the nodes (URLs) already visited. The Web is a really, really, really big graph; as such, this list of visited nodes grows really, really, really big.

Change the object used to store these seen nodes with this. The default object is an instance of Array. Available with Spider is a wrapper of memcached.

You can implement a custom class for this; any object passed to check_already_seen_with must understand just << and include? .

 # default
 check_already_seen_with Array.new

 # memcached
 require 'spider/included_in_memcached'
 check_already_seen_with IncludedInMemcached.new('localhost:11211')

Reset the headers hash.

Use like a hash:

 headers['Cookies'] = 'user_id=1;password=btrross3'

Add a response handler. A response handler‘s trigger can be :every, :success, :failure, or any HTTP status code. The handler itself can be either a Proc or a block.

The arguments to the block are: the URL as a string, an instance of Net::HTTPResponse, and the prior URL as a string.

For example:

 on 404 do |a_url, resp, prior_url|
   puts "URL not found: #{a_url}"
 end

 on :success do |a_url, resp, prior_url|
   puts a_url
   puts resp.body
 end

 on :every do |a_url, resp, prior_url|
   puts "Given this code: #{resp.code}"
 end

Run before the HTTP request. Given the URL as a string.

 setup do |a_url|
   headers['Cookies'] = 'user_id=1;admin=true'
 end

The Web is a really, really, really big graph; as such, this list of nodes to visit grows really, really, really big.

Change the object used to store nodes we have yet to walk. The default object is an instance of Array. Available with Spider is a wrapper of AmazonSQS.

You can implement a custom class for this; any object passed to store_next_urls_with must understand just push and pop .

 # default
 store_next_urls_with Array.new

 # AmazonSQS
 require 'spider/next_urls_in_sqs'
 store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY, queue_name)

Run last, once for each page. Given the URL as a string.
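
For example, a teardown handler can undo what the setup handler did; a sketch modeled on the header handling in the setup example above and in the specs:

 setup do |a_url|
   headers['X-Header-Set'] = 'True'
 end

 teardown do |a_url|
   clear_headers
 end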


ruby-spider-0.4.4/doc/classes/Spider.src/M000029.html

start_at (Spider)
# File lib/spider.rb, line 54
  def self.start_at(a_url, &block)
    rules    = RobotRules.new('Ruby Spider 1.0')
    a_spider = SpiderInstance.new({nil => a_url}, [], rules, [])
    block.call(a_spider)
    a_spider.start!
  end
ruby-spider-0.4.4/doc/classes/QueryServlet.src/M000038.html

do_GET (QueryServlet)
# File spec/spec_helper.rb, line 62
  def do_GET(req, res)
    res['Content-type'] = 'text/plain'
    res.body = "response\n"
  end
ruby-spider-0.4.4/doc/fr_file_index.html

Files

lib/spider.rb
lib/spider/included_in_memcached.rb
lib/spider/next_urls_in_sqs.rb
lib/spider/robot_rules.rb
lib/spider/spider_instance.rb
spec/spec_helper.rb
spec/spider/included_in_memcached_spec.rb
spec/spider/spider_instance_spec.rb
spec/spider_spec.rb
ruby-spider-0.4.4/doc/fr_class_index.html

Classes

BeStaticServerPages
IncludedInMemcached
LoopingServlet
NextUrlsInSQS
QueryServlet
RobotRules
Spider
SpiderInstance
ruby-spider-0.4.4/doc/index.html0000664000000000000000000000136311205312536015227 0ustar rootroot RDoc Documentation ruby-spider-0.4.4/doc/created.rid0000664000000000000000000000004011205312536015330 0ustar rootrootThu, 21 May 2009 15:42:01 +0000 ruby-spider-0.4.4/doc/rdoc-style.css0000664000000000000000000001033211205312536016025 0ustar rootroot body { font-family: Verdana,Arial,Helvetica,sans-serif; font-size: 90%; margin: 0; margin-left: 40px; padding: 0; background: white; } h1,h2,h3,h4 { margin: 0; color: #efefef; background: transparent; } h1 { font-size: 150%; } h2,h3,h4 { margin-top: 1em; } a { background: #eef; color: #039; text-decoration: none; } a:hover { background: #039; color: #eef; } /* Override the base stylesheet's Anchor inside a table cell */ td > a { background: transparent; color: #039; text-decoration: none; } /* and inside a section title */ .section-title > a { background: transparent; color: #eee; text-decoration: none; } /* === Structural elements =================================== */ div#index { margin: 0; margin-left: -40px; padding: 0; font-size: 90%; } div#index a { margin-left: 0.7em; } div#index .section-bar { margin-left: 0px; padding-left: 0.7em; background: #ccc; font-size: small; } div#classHeader, div#fileHeader { width: auto; color: white; padding: 0.5em 1.5em 0.5em 1.5em; margin: 0; margin-left: -40px; border-bottom: 3px solid #006; } div#classHeader a, div#fileHeader a { background: inherit; color: white; } div#classHeader td, div#fileHeader td { background: inherit; color: white; } div#fileHeader { background: #057; } div#classHeader { background: #048; } .class-name-in-header { font-size: 180%; font-weight: bold; } div#bodyContent { padding: 0 1.5em 0 1.5em; } div#description { padding: 0.5em 1.5em; background: #efefef; border: 1px dotted #999; } div#description h1,h2,h3,h4,h5,h6 { color: #125;; background: transparent; } div#validator-badges { text-align: center; } div#validator-badges img { border: 0; } div#copyright { color: #333; background: #efefef; font: 0.75em sans-serif; margin-top: 5em; margin-bottom: 0; padding: 0.5em 2em; } /* === Classes =================================== */ table.header-table { color: white; font-size: small; } .type-note { font-size: small; color: #DEDEDE; } .xxsection-bar { background: #eee; color: #333; padding: 3px; } .section-bar { color: #333; border-bottom: 1px solid #999; margin-left: -20px; } .section-title { background: #79a; color: #eee; padding: 3px; margin-top: 2em; margin-left: -30px; border: 1px solid #999; } .top-aligned-row { vertical-align: top } .bottom-aligned-row { vertical-align: bottom } /* --- Context section classes ----------------------- */ .context-row { } .context-item-name { font-family: monospace; font-weight: bold; color: black; } .context-item-value { font-size: small; color: #448; } .context-item-desc { color: #333; padding-left: 2em; } /* --- Method classes -------------------------- */ .method-detail { background: #efefef; padding: 0; margin-top: 0.5em; margin-bottom: 1em; border: 1px dotted #ccc; } .method-heading { color: black; background: #ccc; border-bottom: 1px solid #666; padding: 0.2em 0.5em 0 0.5em; } .method-signature { color: black; background: inherit; } .method-name { font-weight: bold; } .method-args { font-style: italic; } .method-description { padding: 0 0.5em 0 0.5em; } /* --- Source code sections -------------------- */ a.source-toggle { font-size: 90%; } div.method-source-code { background: #262626; color: #ffdead; margin: 1em; padding: 0.5em; border: 1px dashed 
#999; overflow: hidden; } div.method-source-code pre { color: #ffdead; overflow: hidden; } /* --- Ruby keyword styles --------------------- */ .standalone-code { background: #221111; color: #ffdead; overflow: hidden; } .ruby-constant { color: #7fffd4; background: transparent; } .ruby-keyword { color: #00ffff; background: transparent; } .ruby-ivar { color: #eedd82; background: transparent; } .ruby-operator { color: #00ffee; background: transparent; } .ruby-identifier { color: #ffdead; background: transparent; } .ruby-node { color: #ffa07a; background: transparent; } .ruby-comment { color: #b22222; font-weight: bold; background: transparent; } .ruby-regexp { color: #ffa07a; background: transparent; } .ruby-value { color: #7fffd4; background: transparent; }ruby-spider-0.4.4/spec/0000775000000000000000000000000011205312536013414 5ustar rootrootruby-spider-0.4.4/spec/spider/0000775000000000000000000000000011205312536014702 5ustar rootrootruby-spider-0.4.4/spec/spider/included_in_memcached_spec.rb0000664000000000000000000000200111205312536022475 0ustar rootrootrequire File.dirname(__FILE__)+'/../spec_helper' def before_specing_memcached local_require 'spider/included_in_memcached' system('memcached -d -P /tmp/spider-memcached.pid') end def after_specing_memcached system('kill -KILL `cat /tmp/spider-memcached.pid`') end Spec::Runner.configure { |c| c.mock_with :mocha } describe 'Object to halt cycles' do before do before_specing_memcached end it 'should understand <<' do c = IncludedInMemcached.new('localhost:11211') c.should respond_to(:<<) end it 'should understand included?' do c = IncludedInMemcached.new('localhost:11211') c.should respond_to(:include?) end it 'should produce false if the object is not included' do c = IncludedInMemcached.new('localhost:11211') c.include?('a').should be_false end it 'should produce true if the object is included' do c = IncludedInMemcached.new('localhost:11211') c << 'a' c.include?('a').should be_true end after do after_specing_memcached end end ruby-spider-0.4.4/spec/spider/spider_instance_spec.rb0000664000000000000000000003327611205312536021426 0ustar rootrootrequire File.dirname(__FILE__)+'/../spec_helper' require 'webrick' require 'webrick/https' local_require 'spider', 'spider/included_in_memcached' describe 'SpiderInstance' do # http://www.rcuk.ac.uk/ redirects to /default.htm, which isn't a complete # URL. Bug reported by Henri Cook. it 'should construct a complete redirect URL' do @response_called = false redirected_resp = stub(:redirect? => true, :[] => '/default.htm') success_resp = stub(:redirect? => false) http_req = stub(:request => true) http_mock_redir = stub(:use_ssl= => true) http_mock_redir.stubs(:start).yields(http_req).returns(redirected_resp) http_mock_success = stub(:use_ssl= => true) http_mock_success.stubs(:start).yields(http_req).returns(success_resp) Net::HTTP.expects(:new).times(2).returns(http_mock_redir).then. 
returns(http_mock_success) si = SpiderInstance.new({nil => ['http://www.rcuk.ac.uk/']}) si.get_page(URI.parse('http://www.rcuk.ac.uk/')) do |resp| @response_called = true end @response_called.should be_true end it 'should prevent cycles with an IncludedInMemcached' do with_memcached do cacher = IncludedInMemcached.new('localhost:11211') it_should_prevent_cycles_with(cacher) end end it 'should prevent cycles with an Array' do cacher = Array.new it_should_prevent_cycles_with(cacher) end it 'should call the "setup" callback before loading the Web page' do mock_successful_http @on_called = false @before_called = false si = SpiderInstance.new({nil => ['http://example.com/']}) si.stubs(:allowed?).returns(true) si.stubs(:generate_next_urls).returns([]) si.setup { |*a| @before_called = Time.now } si.on(:every) { |*a| @on_called = Time.now } si.start! @on_called.should_not be_false @before_called.should_not be_false @before_called.should_not be_false @before_called.should < @on_called end it 'should call the "teardown" callback after running all other callbacks' do mock_successful_http @on_called = false @after_called = false si = SpiderInstance.new({nil => ['http://example.com/']}) si.stubs(:allowed?).returns(true) si.stubs(:generate_next_urls).returns([]) si.on(:every) { |*a| @on_called = Time.now } si.teardown { |*a| @after_called = Time.now } si.start! @on_called.should_not be_false @after_called.should_not be_false @after_called.should_not be_false @after_called.should > @on_called end it 'should pass headers set by a setup handler to the HTTP request' do mock_successful_http Net::HTTP::Get.expects(:new).with('/foo',{'X-Header-Set' => 'True'}) si = SpiderInstance.new(nil => ['http://example.com/foo']) si.stubs(:allowable_url?).returns(true) si.stubs(:generate_next_urls).returns([]) si.setup do |a_url| si.headers['X-Header-Set'] = 'True' end si.teardown do |a_url| si.clear_headers end si.start! end it 'should call the :every callback with the current URL, the response, and the prior URL' do mock_successful_http callback_arguments_on(:every) end it 'should call the :success callback with the current URL, the request, and the prior URL' do mock_successful_http callback_arguments_on(:success) end it 'should call the :failure callback with the current URL, the request, and the prior URL' do mock_failed_http callback_arguments_on(:failure) end it 'should call the HTTP status error code callback with the current URL, the request, and the prior URL' do mock_failed_http callback_arguments_on(404) end it 'should call the HTTP status success code callback with the current URL, the request, and the prior URL' do mock_successful_http callback_arguments_on(200) end # Bug reported by John Nagro, using the example source http://eons.com/ # had to change line 192; uses request_uri now instead of path. it 'should handle query URLs without a path' do u = 'http://localhost:8888?s=1' u_p = URI.parse(u) @block_called = false with_web_server(QueryServlet) do si = SpiderInstance.new({nil => [u]}) si.get_page(u_p) do @block_called = true end end @block_called.should be_true end # This solves a problem reported by John Nagro. 
it 'should handle redirects' do u = 'http://example.com/' u_p = URI.parse(u) @redirect_handled = false mock_redirect_http si = SpiderInstance.new({nil => [u]}) si.get_page(u_p) do @redirect_handled = true end @redirect_handled.should be_true end it 'should handle HTTPS' do u = 'https://localhost:10443/' u_p = URI.parse(u) @page_called = false server = WEBrick::HTTPServer.new(:Port => 10443, :Logger => null_logger, :AccessLog => [], :SSLEnable => true, :SSLCertName => [["O", "ruby-lang.org"], ["OU", "sample"], ["CN", WEBrick::Utils::getservername]], :SSLComment => 'Comment of some sort') server.mount('/', QueryServlet) Thread.new {server.start} si = SpiderInstance.new({nil => [u]}) si.get_page(u_p) { @page_called = true } server.shutdown @page_called.should be_true end it 'should skip URLs when allowable_url? is false' do u = 'http://example.com/' u_p = URI.parse(u) http_resp = stub(:redirect? => false, :success? => true, :code => 200, :headers => 1, :body => 1) Net::HTTP.stubs(:new).returns(stub(:request => http_resp, :finish => nil)) si = SpiderInstance.new({nil => [u]}) si.expects(:allowable_url?).with(u, u_p).returns(false) si.expects(:get_page).times(0) si.start! end it 'should not skip URLs when allowable_url? is true' do u = 'http://example.com/' u_p = URI.parse(u) http_resp = stub(:redirect? => false, :success? => true, :code => 200, :headers => 1, :body => 1) Net::HTTP.stubs(:new).returns(stub(:request => http_resp, :finish => nil)) si = SpiderInstance.new({nil => [u]}) si.expects(:allowable_url?).with(u, u_p).returns(true) si.expects(:get_page).with(URI.parse(u)) si.start! end it 'should disallow URLs when the robots.txt says to' do robot_rules = stub SpiderInstance.any_instance.expects(:open). with('http://example.com:80/robots.txt', 'User-Agent' => 'Ruby Spider', 'Accept' => 'text/html,text/xml,application/xml,text/plain'). 
yields(stub(:read => 'robots.txt content')) robot_rules.expects(:parse).with('http://example.com:80/robots.txt', 'robots.txt content') robot_rules.expects(:allowed?).with('http://example.com/').returns(false) si = SpiderInstance.new({nil => ['http://example.com/']}, [], robot_rules, []) allowable = si.allowable_url?('http://example.com/', URI.parse('http://example.com/')) allowable.should be_false end it 'should disallow URLs when they fail any url_check' do si = SpiderInstance.new({nil => ['http://example.com/']}) si.stubs(:allowed?).returns(true) si.add_url_check { |a_url| false } allowable = si.allowable_url?('http://example.com/', URI.parse('http://example.com/')) allowable.should be_false end it 'should support multiple url_checks' do @first_url_check = false @second_url_check = false si = SpiderInstance.new({nil => ['http://example.com/']}) si.stubs(:allowed?).returns(true) si.add_url_check do |a_url| @first_url_check = true true end si.add_url_check do |a_url| @second_url_check = true false end allowable = si.allowable_url?('http://example.com/', URI.parse('http://example.com/')) allowable.should be_false @first_url_check.should be_true @second_url_check.should be_true end it 'should avoid cycles' do u = 'http://example.com/' u_p = URI.parse(u) si = SpiderInstance.new({nil => [u]}, [u_p]) si.stubs(:allowed?).returns(true) allowable = si.allowable_url?(u, u_p) allowable.should be_false u_p.should_not be_nil end it 'should call the 404 handler for 404s' do @proc_called = false mock_failed_http si = SpiderInstance.new({nil => ['http://example.com/']}) si.stubs(:allowed?).returns(true) si.stubs(:generate_next_urls).returns([]) si.on(404) {|*a| @proc_called = true} si.start! @proc_called.should be_true end it 'should call the :success handler on success' do @proc_called = false mock_successful_http si = SpiderInstance.new({nil => ['http://example.com/']}) si.stubs(:allowed?).returns(true) si.stubs(:generate_next_urls).returns([]) si.on(:success) {|*a| @proc_called = true} si.start! @proc_called.should be_true end it 'should not call the :success handler on failure' do @proc_called = false mock_failed_http si = SpiderInstance.new({nil => ['http://example.com/']}) si.stubs(:allowed?).returns(true) si.stubs(:generate_next_urls).returns([]) si.on(:success) {|*a| @proc_called = true} si.start! @proc_called.should be_false end it 'should call the :success handler and the 200 handler on 200' do @proc_200_called = false @proc_success_called = false mock_successful_http si = SpiderInstance.new({nil => ['http://example.com/']}) si.stubs(:allowed?).returns(true) si.stubs(:generate_next_urls).returns([]) si.on(:success) {|*a| @proc_success_called = true} si.on(200) {|*a| @proc_200_called = true} si.start! @proc_200_called.should be_true @proc_success_called.should be_true end it 'should not call the :failure handler on success' do @proc_called = false mock_successful_http si = SpiderInstance.new({nil => ['http://example.com/']}) si.stubs(:allowed?).returns(true) si.stubs(:generate_next_urls).returns([]) si.on(:failure) {|*a| @proc_called = true} si.start! @proc_called.should be_false end it 'should call the :failure handler on failure' do @proc_called = false mock_failed_http si = SpiderInstance.new({nil => ['http://example.com/']}) si.stubs(:allowed?).returns(true) si.stubs(:generate_next_urls).returns([]) si.on(:failure) {|*a| @proc_called = true} si.start! 
@proc_called.should be_true end it 'should call the :failure handler and the 404 handler on 404' do @proc_404_called = false @proc_failure_called = false mock_failed_http si = SpiderInstance.new({nil => ['http://example.com/']}) si.stubs(:allowed?).returns(true) si.stubs(:generate_next_urls).returns([]) si.on(:failure) {|*a| @proc_failure_called = true} si.on(404) {|*a| @proc_404_called = true} si.start! @proc_404_called.should be_true @proc_failure_called.should be_true end it 'should call the :every handler even when a handler for the error code is defined' do @any_called = false mock_successful_http si = SpiderInstance.new({nil => ['http://example.com/']}) si.stubs(:allowed?).returns(true) si.stubs(:generate_next_urls).returns([]) si.on(:every) { |*a| @any_called = true } si.on(202) {|*a|} si.start! @any_called.should be_true end it 'should support a block as a response handler' do @proc_called = false mock_successful_http si = SpiderInstance.new({nil => ['http://example.com/']}) si.stubs(:allowed?).returns(true) si.stubs(:generate_next_urls).returns([]) si.on(:every) { |*a| @proc_called = true } si.start! @proc_called.should be_true end it 'should support a proc as a response handler' do @proc_called = false mock_successful_http si = SpiderInstance.new({nil => ['http://example.com/']}) si.stubs(:allowed?).returns(true) si.stubs(:generate_next_urls).returns([]) si.on(:every, Proc.new { |*a| @proc_called = true }) si.start! @proc_called.should be_true end def mock_http(http_req) http_obj = mock(:use_ssl= => true) http_obj.expects(:start). yields(mock(:request => http_req)).returns(http_req) Net::HTTP.expects(:new).returns(http_obj) end def mock_successful_http http_req = stub(:redirect? => false, :success? => true, :code => 200, :body => 'body') mock_http(http_req) end def mock_failed_http http_req = stub(:redirect? => false, :success? => false, :code => 404) mock_http(http_req) end def mock_redirect_http http_req = stub(:redirect? => true, :success? => false, :code => 404) http_req.expects(:[]).with('Location').returns('http://example.com/') http_req2 = stub(:redirect? => false, :success? => true, :code => 200) http_obj = mock(:use_ssl= => true) http_obj.expects(:start). yields(mock(:request => http_req)).returns(http_req) http_obj2 = mock(:use_ssl= => true) http_obj2.expects(:start). yields(mock(:request => http_req2)).returns(http_req2) Net::HTTP.expects(:new).times(2).returns(http_obj).then.returns(http_obj2) end def callback_arguments_on(code) si = SpiderInstance.new('http://foo.com/' => ['http://example.com/']) si.stubs(:allowed?).returns(true) si.stubs(:generate_next_urls).returns([]) si.on(code) do |a_url, resp, prior_url| a_url.should == 'http://example.com/' resp.should_not be_nil prior_url.should == 'http://foo.com/' end si.start! end def it_should_prevent_cycles_with(cacher) u = 'http://localhost:8888/' u_p = URI.parse(u) u2 = 'http://localhost:8888/foo' u_p2 = URI.parse(u2) with_web_server(LoopingServlet) do si = SpiderInstance.new(nil => [u]) si.check_already_seen_with cacher si.start! 
end end end ruby-spider-0.4.4/spec/spider_spec.rb0000664000000000000000000000156311205312536016246 0ustar rootrootrequire File.dirname(__FILE__)+'/spec_helper' local_require 'spider', 'spider/included_in_memcached' describe 'Spider' do it 'should find two pages without cycles using defaults' do u = [] with_web_server(LoopingServlet) do u = find_pages_with_static_server end u.should be_static_server_pages end it 'should find two pages without cycles using memcached' do u = [] with_web_server(LoopingServlet) do with_memcached do u = find_pages_with_static_server do |s| s.check_already_seen_with IncludedInMemcached.new('localhost:11211') end end end u.should be_static_server_pages end def find_pages_with_static_server(&block) pages = [] Spider.start_at('http://localhost:8888/') do |s| block.call(s) unless block.nil? s.on(:every){ |u,r,p| pages << u } end pages end end ruby-spider-0.4.4/spec/spec_helper.rb0000664000000000000000000000335611205312536016241 0ustar rootrootrequire 'rubygems' require 'webrick' require 'spec' Spec::Runner.configure { |c| c.mock_with :mocha } def local_require(*files) files.each do |file| require File.dirname(__FILE__)+'/../lib/'+file end end class BeStaticServerPages def initialize @pages = ['http://localhost:8888/', 'http://localhost:8888/foo'] @actual = nil end attr :actual, true def matches?(actual) @actual = actual actual == @pages end def failure_message "expected #{@pages.inspect}, got #{@actual.inspect}" end def description "be the pages returned by the static server (#{@pages.inspect})" end end def with_web_server(svlt) server = WEBrick::HTTPServer.new(:Port => 8888, :Logger => null_logger, :AccessLog => []) server.mount('/', svlt) Thread.new {server.start} begin yield ensure server.shutdown end end def with_memcached system('memcached -d -P /tmp/spider-memcached.pid') cacher = IncludedInMemcached.new('localhost:11211') begin yield ensure system('kill -KILL `cat /tmp/spider-memcached.pid`') end end def be_static_server_pages BeStaticServerPages.new end class QueryServlet < WEBrick::HTTPServlet::AbstractServlet def do_GET(req, res) res['Content-type'] = 'text/plain' res.body = "response\n" end end class LoopingServlet < WEBrick::HTTPServlet::AbstractServlet def do_GET(req, res) res['Content-type'] = 'text/html' if req.path == '/foo' res.body = <<-END a END else res.body = <<-END b END end end end def null_logger l = stub [:log, :fatal, :error, :warn , :info, :debug].each do |k| l.stubs(k) l.stubs("#{k}?".to_sym) end l end ruby-spider-0.4.4/spider.gemspec0000664000000000000000000000101611205312536015313 0ustar rootrootrequire 'rubygems' spec = Gem::Specification.new do |s| s.author = 'John Nagro' s.email = 'john.nagro@gmail.com' s.has_rdoc = true s.homepage = 'http://spider.rubyforge.org/' s.name = 'spider' s.rubyforge_project = 'spider' s.summary = 'A Web spidering library' s.files = Dir['**/*'].delete_if { |f| f =~ /(cvs|gem|svn)$/i } s.require_path = 'lib' s.description = <<-EOF A Web spidering library: handles robots.txt, scraping, finding more links, and doing it all over again. EOF s.version = '0.4.4' end ruby-spider-0.4.4/lib/0000775000000000000000000000000011205312536013230 5ustar rootrootruby-spider-0.4.4/lib/spider/0000775000000000000000000000000011205312536014516 5ustar rootrootruby-spider-0.4.4/lib/spider/robot_rules.rb0000664000000000000000000000416211205312536017405 0ustar rootroot# Understand robots.txt. # Created by James Edward Gray II on 2006-01-31. # Copyright 2006 Gray Productions. All rights reserved. 
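# A minimal standalone usage sketch, assuming RobotRules is driven directly;
# the example.com URLs and the toy robots.txt body below are illustrative
# only. parse is handed the robots.txt URL together with its body, and
# allowed? then checks individual URLs against the remembered rules.
#
#   rules = RobotRules.new("Ruby Spider")
#   rules.parse("http://example.com/robots.txt",
#               "User-Agent: *\nDisallow: /private")
#   rules.allowed?("http://example.com/private/secret.html")  # => false
#   rules.allowed?("http://example.com/index.html")           # => true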
require "uri" # Based on Perl's WWW::RobotRules module, by Gisle Aas. class RobotRules def initialize( user_agent ) @user_agent = user_agent.scan(/\S+/).first.sub(%r{/.*}, "").downcase @rules = Hash.new { |rules, rule| rules[rule] = Array.new } end def parse( text_uri, robots_data ) uri = URI.parse(text_uri) location = "#{uri.host}:#{uri.port}" @rules.delete(location) rules = robots_data.split(/[\015\012]+/).map do |rule| rule.sub(/\s*#.*$/, "") end anon_rules = Array.new my_rules = Array.new current = anon_rules rules.each do |rule| case rule when /^\s*User-Agent\s*:\s*(.+?)\s*$/i break unless my_rules.empty? current = if $1 == "*" anon_rules elsif $1.downcase.index(@user_agent) my_rules else nil end when /^\s*Disallow\s*:\s*(.*?)\s*$/i next if current.nil? if $1.empty? current << nil else disallow = URI.parse($1) next unless disallow.scheme.nil? or disallow.scheme == uri.scheme next unless disallow.port.nil? or disallow.port == uri.port next unless disallow.host.nil? or disallow.host.downcase == uri.host.downcase disallow = disallow.path disallow = "/" if disallow.empty? disallow = "/#{disallow}" unless disallow[0] == ?/ current << disallow end end end @rules[location] = if my_rules.empty? anon_rules.compact else my_rules.compact end end def allowed?( text_uri ) uri = URI.parse(text_uri) location = "#{uri.host}:#{uri.port}" path = uri.path return true unless %w{http https}.include?(uri.scheme) not @rules[location].any? { |rule| path.index(rule) == 0 } end end ruby-spider-0.4.4/lib/spider/included_in_memcached.rb0000664000000000000000000000442511205312536021313 0ustar rootroot# Use memcached to track cycles. # # Copyright 2007 Mike Burns # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name Mike Burns nor the # names of his contributors may be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. require 'memcache' # A specialized class using memcached to track items stored. It supports # three operations: new, <<, and include? . Together these can be used to # add items to the memcache, then determine whether the item has been added. 
# # To use it with Spider use the check_already_seen_with method: # # Spider.start_at('http://example.com/') do |s| # s.check_already_seen_with IncludedInMemcached.new('localhost:11211') # end class IncludedInMemcached # Construct a new IncludedInMemcached instance. All arguments here are # passed to MemCache (part of the memcache-client gem). def initialize(*a) @c = MemCache.new(*a) end # Add an item to the memcache. def <<(v) @c.add(v.to_s, v) end # True if the item is in the memcache. def include?(v) @c.get(v.to_s) == v end end ruby-spider-0.4.4/lib/spider/spider_instance.rb0000664000000000000000000002447311205312536020227 0ustar rootroot# Specialized spidering rules. # Copyright 2007-2008 Mike Burns & John Nagro # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name Mike Burns nor the # names of his contributors may be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. require File.dirname(__FILE__)+'/robot_rules.rb' require 'open-uri' require 'uri' require 'net/http' require 'net/https' module Net #:nodoc: class HTTPResponse #:nodoc: def success?; false; end def redirect?; false; end end class HTTPSuccess #:nodoc: def success?; true; end end class HTTPRedirection #:nodoc: def redirect?; true; end end end class NilClass #:nodoc: def merge(h); h; end end class SpiderInstance def initialize(next_urls, seen = [], rules = nil, robots_seen = []) #:nodoc: @url_checks = [] @cache = :memory @callbacks = {} @next_urls = [next_urls] @seen = seen @rules = rules || RobotRules.new('Ruby Spider 0.4.4') @robots_seen = robots_seen @headers = {} @setup = nil @teardown = nil end # Add a predicate that determines whether to continue down this URL's path. # All predicates must be true in order for a URL to proceed. # # Takes a block that takes a string and produces a boolean. For example, this # will ensure that the URL starts with 'http://mike-burns.com': # # add_url_check { |a_url| a_url =~ %r{^http://mike-burns.com.*} def add_url_check(&block) @url_checks << block end # The Web is a graph; to avoid cycles we store the nodes (URLs) already # visited. The Web is a really, really, really big graph; as such, this list # of visited nodes grows really, really, really big. # # Change the object used to store these seen nodes with this. The default # object is an instance of Array. 
Available with Spider is a wrapper of # memcached. # # You can implement a custom class for this; any object passed to # check_already_seen_with must understand just << and included? . # # # default # check_already_seen_with Array.new # # # memcached # require 'spider/included_in_memcached' # check_already_seen_with IncludedInMemcached.new('localhost:11211') def check_already_seen_with(cacher) if cacher.respond_to?(:<<) && cacher.respond_to?(:include?) @seen = cacher else raise ArgumentError, 'expected something that responds to << and included?' end end # The Web is a really, really, really big graph; as such, this list # of nodes to visit grows really, really, really big. # # Change the object used to store nodes we have yet to walk. The default # object is an instance of Array. Available with Spider is a wrapper of # AmazonSQS. # # You can implement a custom class for this; any object passed to # check_already_seen_with must understand just push and pop . # # # default # store_next_urls_with Array.new # # # AmazonSQS # require 'spider/next_urls_in_sqs' # store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY, queue_name) def store_next_urls_with(a_store) tmp_next_urls = @next_urls @next_urls = a_store tmp_next_urls.each do |a_url_hash| @next_urls.push a_url_hash end end # Add a response handler. A response handler's trigger can be :every, # :success, :failure, or any HTTP status code. The handler itself can be # either a Proc or a block. # # The arguments to the block are: the URL as a string, an instance of # Net::HTTPResponse, and the prior URL as a string. # # # For example: # # on 404 do |a_url, resp, prior_url| # puts "URL not found: #{a_url}" # end # # on :success do |a_url, resp, prior_url| # puts a_url # puts resp.body # end # # on :every do |a_url, resp, prior_url| # puts "Given this code: #{resp.code}" # end def on(code, p = nil, &block) f = p ? p : block case code when Fixnum @callbacks[code] = f else @callbacks[code.to_sym] = f end end # Run before the HTTP request. Given the URL as a string. # setup do |a_url| # headers['Cookies'] = 'user_id=1;admin=true' # end def setup(p = nil, &block) @setup = p ? p : block end # Run last, once for each page. Given the URL as a string. def teardown(p = nil, &block) @teardown = p ? p : block end # Use like a hash: # headers['Cookies'] = 'user_id=1;password=btrross3' def headers HeaderSetter.new(self) end def raw_headers #:nodoc: @headers end def raw_headers=(v) #:nodoc: @headers = v end # Reset the headers hash. def clear_headers @headers = {} end def start! #:nodoc: interrupted = false trap("SIGINT") { interrupted = true } begin next_urls = @next_urls.pop tmp_n_u = {} next_urls.each do |prior_url, urls| urls.map do |a_url| [a_url, (URI.parse(a_url) rescue nil)] end.select do |a_url, parsed_url| allowable_url?(a_url, parsed_url) end.each do |a_url, parsed_url| @setup.call(a_url) unless @setup.nil? get_page(parsed_url) do |response| do_callbacks(a_url, response, prior_url) #tmp_n_u[a_url] = generate_next_urls(a_url, response) #@next_urls.push tmp_n_u generate_next_urls(a_url, response).each do |a_next_url| @next_urls.push a_url => a_next_url end #exit if interrupted end @teardown.call(a_url) unless @teardown.nil? exit if interrupted end end end while !@next_urls.empty? end def success_or_failure(code) #:nodoc: if code > 199 && code < 300 :success else :failure end end def allowable_url?(a_url, parsed_url) #:nodoc: !parsed_url.nil? 
&& !@seen.include?(parsed_url) && allowed?(a_url, parsed_url) && @url_checks.map{|url_check|url_check.call(a_url)}.all? end # True if the robots.txt for that URL allows access to it. def allowed?(a_url, parsed_url) # :nodoc: return false unless ['http','https'].include?(parsed_url.scheme) u = "#{parsed_url.scheme}://#{parsed_url.host}:#{parsed_url.port}/robots.txt" parsed_u = URI.parse(u) return false unless @url_checks.map{|url_check|url_check.call(a_url)}.all? begin unless @robots_seen.include?(u) #open(u, 'User-Agent' => 'Ruby Spider', # 'Accept' => 'text/html,text/xml,application/xml,text/plain', :ssl_verify => false) do |url| # @rules.parse(u, url.read) #end get_page(parsed_u) do |r| @rules.parse(u, r.body) end @robots_seen << u end @rules.allowed?(a_url) rescue OpenURI::HTTPError true # No robots.txt rescue Exception, Timeout::Error # to keep it from crashing false end end def get_page(parsed_url, &block) #:nodoc: @seen << parsed_url begin http = Net::HTTP.new(parsed_url.host, parsed_url.port) if parsed_url.scheme == 'https' http.use_ssl = true http.verify_mode = OpenSSL::SSL::VERIFY_NONE end # Uses start because http.finish cannot be called. r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri, @headers))} if r.redirect? get_page(URI.parse(construct_complete_url(parsed_url,r['Location'])), &block) else block.call(r) end rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError => e p e nil end end def do_callbacks(a_url, resp, prior_url) #:nodoc: cbs = [@callbacks[:every], resp.success? ? @callbacks[:success] : @callbacks[:failure], @callbacks[resp.code]] cbs.each do |cb| cb.call(a_url, resp, prior_url) if cb end end def generate_next_urls(a_url, resp) #:nodoc: web_page = resp.body base_url = (web_page.scan(/base\s+href="(.*?)"/i).flatten + [a_url[0,a_url.rindex('/')]])[0] base_url = remove_trailing_slash(base_url) web_page.scan(/href="(.*?)"/i).flatten.map do |link| begin parsed_link = URI.parse(link) if parsed_link.fragment == '#' nil else construct_complete_url(base_url, link, parsed_link) end rescue nil end end.compact end def construct_complete_url(base_url, additional_url, parsed_additional_url = nil) #:nodoc: parsed_additional_url ||= URI.parse(additional_url) case parsed_additional_url.scheme when nil u = base_url.is_a?(URI) ? base_url : URI.parse(base_url) if additional_url[0].chr == '/' "#{u.scheme}://#{u.host}#{additional_url}" elsif u.path.nil? || u.path == '' "#{u.scheme}://#{u.host}/#{additional_url}" elsif u.path[0].chr == '/' "#{u.scheme}://#{u.host}#{u.path}/#{additional_url}" else "#{u.scheme}://#{u.host}/#{u.path}/#{additional_url}" end else additional_url end end def remove_trailing_slash(s) #:nodoc: s.sub(%r{/*$},'') end class HeaderSetter #:nodoc: def initialize(si) @si = si end def []=(k,v) @si.raw_headers = @si.raw_headers.merge({k => v}) end end end ruby-spider-0.4.4/lib/spider/next_urls_in_sqs.rb0000664000000000000000000000544011205312536020445 0ustar rootroot# Use AmazonSQS to track nodes to visit. # # Copyright 2008 John Nagro # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. 
# * Neither the name Mike Burns nor the # names of his contributors may be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. require 'rubygems' require 'right_aws' require 'yaml' # A specialized class using AmazonSQS to track nodes to walk. It supports # two operations: push and pop . Together these can be used to # add items to the queue, then pull items off the queue. # # This is useful if you want multiple Spider processes crawling the same # data set. # # To use it with Spider use the store_next_urls_with method: # # Spider.start_at('http://example.com/') do |s| # s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY, queue_name) # end class NextUrlsInSQS # Construct a new NextUrlsInSQS instance. All arguments here are # passed to RightAWS::SqsGen2 (part of the right_aws gem) or used # to set the AmazonSQS queue name (optional). def initialize(aws_access_key, aws_secret_access_key, queue_name = 'ruby-spider') @sqs = RightAws::SqsGen2.new(aws_access_key, aws_secret_access_key) @queue = @sqs.queue(queue_name) end # Pull an item off the queue, loop until data is found. Data is # encoded with YAML. def pop while true message = @queue.pop return YAML::load(message.to_s) unless message.nil? sleep 5 end end # Put data on the queue. Data is encoded with YAML. def push(a_msg) encoded_message = YAML::dump(a_msg) @queue.push(encoded_message) end endruby-spider-0.4.4/lib/spider.rb0000664000000000000000000000511411205312536015044 0ustar rootroot# Copyright 2007-2008 Mike Burns & John Nagro # :include: README # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name Mike Burns nor the # names of his contributors may be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. 
IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. require File.dirname(__FILE__)+'/spider/spider_instance' # A spidering library for Ruby. Handles robots.txt, scraping, finding more # links, and doing it all over again. class Spider # Runs the spider starting at the given URL. Also takes a block that is given # the SpiderInstance. Use the block to define the rules and handlers for # the discovered Web pages. See SpiderInstance for the possible rules and # handlers. # # Spider.start_at('http://mike-burns.com/') do |s| # s.add_url_check do |a_url| # a_url =~ %r{^http://mike-burns.com.*} # end # # s.on 404 do |a_url, resp, prior_url| # puts "URL not found: #{a_url}" # end # # s.on :success do |a_url, resp, prior_url| # puts "body: #{resp.body}" # end # # s.on :every do |a_url, resp, prior_url| # puts "URL returned anything: #{a_url} with this code #{resp.code}" # end # end def self.start_at(a_url, &block) rules = RobotRules.new('Ruby Spider 1.0') a_spider = SpiderInstance.new({nil => a_url}, [], rules, []) block.call(a_spider) a_spider.start! end end ruby-spider-0.4.4/README0000664000000000000000000000645711205312536013356 0ustar rootroot Spider, a Web spidering library for Ruby. It handles the robots.txt, scraping, collecting, and looping so that you can just handle the data. == Examples === Crawl the Web, loading each page in turn, until you run out of memory require 'spider' Spider.start_at('http://mike-burns.com/') {} === To handle erroneous responses require 'spider' Spider.start_at('http://mike-burns.com/') do |s| s.on :failure do |a_url, resp, prior_url| puts "URL failed: #{a_url}" puts " linked from #{prior_url}" end end === Or handle successful responses require 'spider' Spider.start_at('http://mike-burns.com/') do |s| s.on :success do |a_url, resp, prior_url| puts "#{a_url}: #{resp.code}" puts resp.body puts end end === Limit to just one domain require 'spider' Spider.start_at('http://mike-burns.com/') do |s| s.add_url_check do |a_url| a_url =~ %r{^http://mike-burns.com.*} end end === Pass headers to some requests require 'spider' Spider.start_at('http://mike-burns.com/') do |s| s.setup do |a_url| if a_url =~ %r{^http://.*wikipedia.*} headers['User-Agent'] = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" end end end === Use memcached to track cycles require 'spider' require 'spider/included_in_memcached' SERVERS = ['10.0.10.2:11211','10.0.10.3:11211','10.0.10.4:11211'] Spider.start_at('http://mike-burns.com/') do |s| s.check_already_seen_with IncludedInMemcached.new(SERVERS) end === Track cycles with a custom object require 'spider' class ExpireLinks < Hash def <<(v) self[v] = Time.now end def include?(v) self[v].kind_of?(Time) && (self[v] + 86400) >= Time.now end end Spider.start_at('http://mike-burns.com/') do |s| s.check_already_seen_with ExpireLinks.new end === Store nodes to visit with Amazon SQS require 'spider' require 'spider/next_urls_in_sqs' Spider.start_at('http://mike-burns.com') do |s| s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY) end ==== Store nodes to visit with a 
custom object require 'spider' class MyArray < Array def pop super end def push(a_msg) super(a_msg) end end Spider.start_at('http://mike-burns.com') do |s| s.store_next_urls_with MyArray.new end === Create a URL graph require 'spider' nodes = {} Spider.start_at('http://mike-burns.com/') do |s| s.add_url_check {|a_url| a_url =~ %r{^http://mike-burns.com.*} } s.on(:every) do |a_url, resp, prior_url| nodes[prior_url] ||= [] nodes[prior_url] << a_url end end === Use a proxy require 'net/http_configuration' require 'spider' http_conf = Net::HTTP::Configuration.new(:proxy_host => '7proxies.org', :proxy_port => 8881) http_conf.apply do Spider.start_at('http://img.4chan.org/b/') do |s| s.on(:success) do |a_url, resp, prior_url| File.open(a_url.gsub('/',':'),'w') do |f| f.write(resp.body) end end end end == Author John Nagro john.nagro@gmail.com Mike Burns http://mike-burns.com mike@mike-burns.com (original author) Many thanks to: Matt Horan Henri Cook Sander van der Vliet John Buckley Brian Campbell With `robot_rules' from James Edward Gray II via http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589
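=== Time each request with setup and teardown

A small sketch of one way to combine the setup and teardown handlers, which
run before and after each page and are both given the URL as a string; the
started_at hash here is illustrative only, not part of the library.

  require 'spider'

  started_at = {}
  Spider.start_at('http://mike-burns.com/') do |s|
    s.setup do |a_url|
      started_at[a_url] = Time.now
    end

    s.teardown do |a_url|
      puts "#{a_url} took #{Time.now - started_at[a_url]} seconds"
    end
  end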