commit 5774ca2836c5a92bd60ef5933cf69ab3a538384f
Author: Dan Milne
Date:   Sun Sep 10 10:02:52 2023 +1000

    Switch to Net::HTTP and set the request header to match the query header

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..9106b2a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,8 @@
+/.bundle/
+/.yardoc
+/_yardoc/
+/coverage/
+/doc/
+/pkg/
+/spec/reports/
+/tmp/
diff --git a/.standard.yml b/.standard.yml
new file mode 100644
index 0000000..08d0e90
--- /dev/null
+++ b/.standard.yml
@@ -0,0 +1,3 @@
+# For available configuration options, see:
+# https://github.com/testdouble/standard
+ruby_version: 3.0.0
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..5ba7c3c
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,5 @@
+## [Unreleased]
+
+## [0.1.0] - 2023-09-09
+
+- Initial release
diff --git a/Gemfile b/Gemfile
new file mode 100644
index 0000000..52d485e
--- /dev/null
+++ b/Gemfile
@@ -0,0 +1,11 @@
+# frozen_string_literal: true
+
+source "https://rubygems.org"
+
+# Specify your gem's dependencies in probot.gemspec
+gemspec
+
+gem "rake", "~> 13.0"
+gem "minitest", "~> 5.0"
+gem "rubocop", "~> 1.21"
+gem "standard", "~> 1.31" # Adjust the version as needed
diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 0000000..d0ccb6b
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2023 Dan Milne
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..b12ab49
--- /dev/null
+++ b/README.md
@@ -0,0 +1,90 @@
+# Probot
+
+OMG, another Ruby robots.txt parser? It was an accident: I didn't mean to make it, and I shouldn't have, but here we are. It started out tiny and grew. Yes, I should have used one of the other gems.
+
+Does this even deserve a gem? Feel free to just copy and paste the single file that implements this - one less dependency, eh?
+
+On the plus side, it has some nice features I don't think the others have.
+
+1. Supports consecutive user agents making up a single record:
+
+```txt
+# Block both first-agent and second-agent from the site.
+User-agent: first-agent
+User-agent: second-agent
+Disallow: /
+```
+
+2. It can select the most specific allow / disallow rule, using rule length as a proxy for specificity. You can also ask it to show you the matching rules and their scores.
+ +```ruby +txt = %Q{ +User-agent: * +Disallow: /dir1 +Allow: /dir1/dir2 +Disallow: /dir1/dir2/dir3 +} +Probot.new(txt).matches("/dir1/dir2/dir3") +=> {:disallowed=>{/\/dir1/=>5, /\/dir1\/dir2\/dir3/=>15}, :allowed=>{/\/dir1\/dir2/=>10}} +``` + +In this case, we can see the Disallow rule with length 15 would be followed. + +## Installation + +Install the gem and add to the application's Gemfile by executing: + + $ bundle add probot + +If bundler is not being used to manage dependencies, install the gem by executing: + + $ gem install probot + +## Usage + +It's straightforward to use. Instantiate it if you'll make a few requests: + +```ruby +> r = Probot.new('https://booko.info', agent: 'MyAgent') +> r.rules +=> {"*"=>{"disallow"=>[/\/search/, /\/products\/search/, /\/.*\/refresh_prices/, /\/.*\/add_to_cart/, /\/.*\/get_prices/, /\/lists\/add/, /\/.*\/add$/, /\/api\//, /\/users\/bits/, /\/users\/create/, /\/prices\//, /\/widgets\/issue/], "allow"=>[], "crawl_delay"=>0, "crawl-delay"=>0.1}, + "YandexBot"=>{"disallow"=>[], "allow"=>[], "crawl_delay"=>0, "crawl-delay"=>300.0}} + +> r.allowed?("/abc/add_to_cart") +=> false +> r.allowed?("https://booko.info/9780765397522/All-Systems-Red") +=> true +> r.allowed?("https://booko.info/9780765397522/add_to_cart") +=> false +``` + +Or just one-shot it for one-offs: + +```ruby +Probot.allowed?("https://booko.info/9780765397522/All-Systems-Red", agent: "BookScraper") +``` + + +## Development + +After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment. + +To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org). + +## Contributing + +Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/Probot. + +## Further Reading + +* https://moz.com/learn/seo/robotstxt +* https://stackoverflow.com/questions/45293419/order-of-directives-in-robots-txt-do-they-overwrite-each-other-or-complement-ea +* https://developers.google.com/search/docs/crawling-indexing/robots/robots_txt +* https://developers.google.com/search/docs/crawling-indexing/robots/create-robots-txt + +* https://github.com/google/robotstxt - Google's official parser + + +## License + +The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT). diff --git a/Rakefile b/Rakefile new file mode 100644 index 0000000..5bb6087 --- /dev/null +++ b/Rakefile @@ -0,0 +1,14 @@ +# frozen_string_literal: true + +require "bundler/gem_tasks" +require "rake/testtask" + +Rake::TestTask.new(:test) do |t| + t.libs << "test" + t.libs << "lib" + t.test_files = FileList["test/**/test_*.rb"] +end + +require "standard/rake" + +task default: %i[test standard] diff --git a/bin/console b/bin/console new file mode 100755 index 0000000..e76ba34 --- /dev/null +++ b/bin/console @@ -0,0 +1,11 @@ +#!/usr/bin/env ruby +# frozen_string_literal: true + +require "bundler/setup" +require "probot" + +# You can add fixtures and/or initialization code here to make experimenting +# with your gem easier. You can also use a different console, if you like. 
+
+require "irb"
+IRB.start(__FILE__)
diff --git a/bin/setup b/bin/setup
new file mode 100755
index 0000000..dce67d8
--- /dev/null
+++ b/bin/setup
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+set -euo pipefail
+IFS=$'\n\t'
+set -vx
+
+bundle install
+
+# Do any other automated setup that you need to do here
diff --git a/lib/probot.rb b/lib/probot.rb
new file mode 100644
index 0000000..08db6ce
--- /dev/null
+++ b/lib/probot.rb
@@ -0,0 +1,159 @@
+# frozen_string_literal: true
+
+require_relative "probot/version"
+
+require "uri"
+require "net/http"
+
+# https://moz.com/learn/seo/robotstxt
+# https://stackoverflow.com/questions/45293419/order-of-directives-in-robots-txt-do-they-overwrite-each-other-or-complement-ea
+# https://developers.google.com/search/docs/crawling-indexing/robots/create-robots-txt
+# https://developers.google.com/search/docs/crawling-indexing/robots/robots_txt
+#
+# https://github.com/google/robotstxt - Google's official parser
+
+# Note: User-agent lines found on consecutive lines are considered to be part of the same record.
+# Note: Google ignores crawl_delay.
+# Note: Google does not consider crawl_delay or sitemap to be part of the per-agent records.
+
+# Two main parts of this class:
+#   Parse a robots.txt file.
+#   Find the most specific rule for a given URL. We use the length of the regexp as a proxy for specificity.
+
+class Robots
+  attr_reader :rules, :sitemap, :doc
+  attr_accessor :agent
+
+  def initialize(data, agent: "*")
+    raise ArgumentError, "The first argument must be a string" unless data.is_a?(String)
+    @agent = agent
+
+    @rules = {}
+    @current_agents = ["*"]
+    @current_agents.each { |agent| @rules[agent] ||= {"disallow" => [], "allow" => [], "crawl_delay" => 0} }
+    @sitemaps = []
+
+    @doc = data.start_with?("http") ? fetch_robots_txt(data) : data
+    parse(@doc)
+  end
+
+  def request_headers = (agent == "*") ? {} : {"User-Agent" => @agent}
+
+  # Any fetch error is treated as an empty robots.txt, i.e. everything is allowed.
+  def fetch_robots_txt(url)
+    Net::HTTP.get(URI(url).tap { |u| u.path = "/robots.txt" }, request_headers)
+  rescue
+    ""
+  end
+
+  def crawl_delay = rules.dig(@agent, "crawl_delay")
+
+  def found_agents = rules.keys
+
+  def disallowed = rules.dig(@agent, "disallow") || rules.dig("*", "disallow")
+
+  def allowed = rules.dig(@agent, "allow") || rules.dig("*", "allow")
+
+  def disallowed_matches(url) = disallowed.select { |disallowed_url| url.match?(disallowed_url) }.to_h { |rule| [rule, pattern_length(rule)] }
+
+  def allowed_matches(url) = allowed.select { |allowed_url| url.match?(allowed_url) }.to_h { |rule| [rule, pattern_length(rule)] }
+
+  def matches(url) = {disallowed: disallowed_matches(url), allowed: allowed_matches(url)}
+
+  def disallowed_best(url) = disallowed_matches(url).max_by { |k, v| v }
+
+  def allowed_best(url) = allowed_matches(url).max_by { |k, v| v }
+
+  def matching_rule(url) = (disallowed_best(url)&.last.to_i > allowed_best(url)&.last.to_i) ? {disallow: disallowed_best(url)&.first} : {allow: allowed_best(url)&.first}
+
+  # If a URL is not disallowed, it is allowed - so we check whether it is explicitly disallowed and, if not, it's allowed.
+  def allowed?(url) = !disallowed?(url)
+
+  def disallowed?(url) = matching_rule(url)&.keys&.first == :disallow
+
+  def parse(doc)
+    # We need to handle consecutive user-agent lines, which are considered to be part of the same record.
+    subsequent_agent = false
+
+    doc.lines.each do |line|
+      next if line.start_with?("#") || !line.include?(":") || line.split(":").length < 2
+
+      data = ParsedLine.new(line)
+
+      if data.agent?
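+        # A User-agent line that immediately follows another User-agent line
+        # extends the current record; once an Allow or Disallow rule has been
+        # seen, the next User-agent line starts a new record.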
+        if subsequent_agent
+          @current_agents << data.value
+        else
+          @current_agents = [data.value]
+          subsequent_agent = true
+        end
+
+        @current_agents.each { |agent| rules[agent] ||= {"disallow" => [], "allow" => [], "crawl_delay" => 0} }
+        next
+      end
+
+      # All Regex characters are escaped, then we unescape * and $ as they may be used in robots.txt patterns.
+
+      if data.allow? || data.disallow?
+        @current_agents.each { |agent| rules[agent][data.key] << Regexp.new(Regexp.escape(data.value).gsub('\*', ".*").gsub('\$', "$")) }
+
+        subsequent_agent = false # An Allow or Disallow rule ends a run of consecutive User-agent lines; the next User-agent starts a new record.
+        next
+      end
+
+      if data.crawl_delay?
+        @current_agents.each { |agent| rules[agent][data.key] = data.value }
+        next
+      end
+
+      if data.sitemap?
+        @sitemap = URI(data.value).path
+        next
+      end
+
+      @current_agents.each { |agent| rules[agent][data.key] = data.value }
+    end
+  end
+
+  # Escaped characters (e.g. \.) count as a single character when scoring a pattern's length.
+  def pattern_length(regexp) = regexp.source.gsub(/(\\[\*\$\.])/, "*").length
+
+  # ParsedLine Note: In the case of 'Sitemap: https://example.com/sitemap.xml', raw_value needs to rejoin the URL after splitting on ':'.
+
+  ParsedLine = Struct.new(:input_string) do
+    def key = input_string.split(":").first&.strip&.downcase
+
+    def raw_value = input_string.split(":").slice(1..)&.join(":")&.strip
+
+    def clean_value = raw_value.split("#").first&.strip
+
+    def agent? = key == "user-agent"
+
+    def disallow? = key == "disallow"
+
+    def allow? = key == "allow"
+
+    def crawl_delay? = key == "crawl-delay"
+
+    def sitemap? = key == "sitemap"
+
+    def value
+      return clean_value.to_f if crawl_delay?
+      return URI(clean_value).to_s if disallow? || allow?
+
+      raw_value
+    rescue URI::InvalidURIError
+      raw_value
+    end
+  end
+
+  def self.allowed?(url, agent: "*") = Robots.new(url, agent: agent).allowed?(url)
+end
+
+# Robots.allowed?("https://booko.info/9780765397522/All-Systems-Red")
+# => true
+# r = Robots.new('https://booko.info', agent: 'YandexBot')
+# r = Robots.new('https://www.allenandunwin.com')
+# Robots.new('https://www.amazon.com/').matches("/gp/wishlist/ipad-install/gcrnsts")
+# => {:disallowed=>{/\/wishlist\//=>10, /\/gp\/wishlist\//=>13, /.*\/gcrnsts/=>10}, :allowed=>{/\/gp\/wishlist\/ipad\-install.*/=>28}}
+#
+# Test with
+# assert Robots.new(%Q{allow: /$\ndisallow: /}).matching_rule('https://example.com/page.htm') == {disallow: /\//}
diff --git a/lib/probot/version.rb b/lib/probot/version.rb
new file mode 100644
index 0000000..19d16d3
--- /dev/null
+++ b/lib/probot/version.rb
@@ -0,0 +1,3 @@
+class Probot
+  VERSION = "0.1.0"
+end
diff --git a/probot.gemspec b/probot.gemspec
new file mode 100644
index 0000000..fc72399
--- /dev/null
+++ b/probot.gemspec
@@ -0,0 +1,40 @@
+# frozen_string_literal: true
+
+require_relative "lib/probot/version"
+
+Gem::Specification.new do |spec|
+  spec.name = "Probot"
+  spec.version = Probot::VERSION
+  spec.authors = ["Dan Milne"]
+  spec.email = ["d@nmilne.com"]
+
+  spec.summary = "A robots.txt parser."
+  spec.description = "A more fully featured robots.txt parser."
+ spec.homepage = "http://github.com/dkam/probot" + spec.license = "MIT" + spec.required_ruby_version = ">= 3.0" + + spec.metadata["allowed_push_host"] = "TODO: Set to your gem server 'https://example.com'" + + spec.metadata["homepage_uri"] = spec.homepage + spec.metadata["source_code_uri"] = "http://github.com/dkam/probot" + spec.metadata["changelog_uri"] = "http://github.com/dkam/probot/CHANGELOG.md" + + # Specify which files should be added to the gem when it is released. + # The `git ls-files -z` loads the files in the RubyGem that have been added into git. + spec.files = Dir.chdir(__dir__) do + `git ls-files -z`.split("\x0").reject do |f| + (File.expand_path(f) == __FILE__) || + f.start_with?(*%w[bin/ test/ spec/ features/ .git .circleci appveyor Gemfile]) + end + end + spec.bindir = "exe" + spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) } + spec.require_paths = ["lib"] + + # Uncomment to register a new dependency of your gem + # spec.add_dependency "example-gem", "~> 1.0" + + # For more information and examples about making a new gem, check out our + # guide at: https://bundler.io/guides/creating_gem.html +end diff --git a/sig/probot.rbs b/sig/probot.rbs new file mode 100644 index 0000000..fdb8173 --- /dev/null +++ b/sig/probot.rbs @@ -0,0 +1,4 @@ +module Probot + VERSION: String + # See the writing guide of rbs: https://github.com/ruby/rbs#guides +end diff --git a/test/test_helper.rb b/test/test_helper.rb new file mode 100644 index 0000000..2abc2ba --- /dev/null +++ b/test/test_helper.rb @@ -0,0 +1,6 @@ +# frozen_string_literal: true + +$LOAD_PATH.unshift File.expand_path("../lib", __dir__) +require "probot" + +require "minitest/autorun" diff --git a/test/test_probot.rb b/test/test_probot.rb new file mode 100644 index 0000000..5c8900d --- /dev/null +++ b/test/test_probot.rb @@ -0,0 +1,170 @@ +# frozen_string_literal: true + +require "test_helper" + +class TestProbot < Minitest::Test + def test_that_it_has_a_version_number + refute_nil ::Probot::VERSION + end + + TEST_CASES = [ + { + txt: %( + User-Agent: * + Disallow : /admin/ + Disallow : /cart/ + Disallow : /client/ + Sitemap: http://www.allenandunwin.com/sitemap.xml + + User-Agent: FooBot + Disallow: /private/ + Allow: /cart/ + + User-Agent: BlahBot + User-Agent: YadaBot + Disallow: /noblah/ + Allow: /cart/ + ), + sitemap: "/sitemap.xml", + found_agents: ["*", "FooBot", "BlahBot", "YadaBot"], + tests: [ + { + agent: "*", + allowed: ["/books/9781760878854", "/books/9781760878861", "/books/9781760878878"], + disallowed: ["/admin/", "/cart/", "/client/"], + crawl_delay: 0 + } + ] + }, { + txt: %( + User-agent: * + Disallow: /?*\t\t\t#comment + Disallow: /home/ + Disallow: /dashboard + Disallow: /terms-conditions + Disallow: /privacy-policy + Disallow: /index.php + Disallow: /chargify_system + Disallow: /test* + Disallow: /team* # comment + Disallow: /index + Allow: / # comment + Sitemap: http://example.com/sitemap.xml + ), + sitemap: "/sitemap.xml", + found_agents: ["*"], + tests: [ + { + agent: "*", + allowed: ["/home", "/books/9781760878878", "/client/"], + disallowed: ["/home/", "/dashboard", "/test/hello", "/team/", "/team/1", "/teamtest"], + crawl_delay: 0 + }, + { + agent: "UnfoundAgent", + allowed: ["/home", "/books/9781760878878", "/client/"], + disallowed: ["/home/", "/dashboard", "/test/hello", "/team/", "/team/1", "/teamtest"], + crawl_delay: 0 + } + ] + }, + # These tests from https://github.com/rinzi/robotstxt + { + txt: %(User-agent: rubytest + Disallow: /no-dir/ + Disallow: /no-page.php + 
Disallow: /*-no-dir/ + Disallow: /dir/*.php + Disallow: *?var + Disallow: /dir/*?var + + # this is a test + useragent: * + disalow: /test/ + + sitemap: /sitemapxml.xml + + ), + sitemap: "/sitemapxml.xml", + found_agents: ["*", "rubytest"], + tests: [ + { + agent: "rubytest", + allowed: ["/", "/blog/", "/blog/page.php"], + disallowed: ["/no-dir/", "/foo-no-dir/", "/foo-no-dir/page.html", "/dir/page.php", "/page.php?var=0", "/dir/page.php?var=0", "/blog/page.php?var=0"], + crawl_delay: 0 + } + ] + } + ].freeze + + def test_some_tests + TEST_CASES.each_with_index do |test_case, ind| + r = Robots.new(test_case[:txt]) + + assert_equal test_case[:found_agents], r.found_agents, "found_agents for test #{ind}" + assert_equal test_case[:sitemap], r.sitemap, "sitemap for test #{ind}" + + test_case[:tests].each do |tst| + r = Robots.new(test_case[:txt], agent: tst[:agent]) + + tst[:allowed].each do |url| + assert r.allowed?(url), "expected #{url} to be allowed, for agent #{tst[:agent]} | test #{ind}" + end + + tst[:disallowed].each do |url| + assert r.disallowed?(url), "expected #{url} to be disallowed, for agent #{tst[:agent]} | test #{ind}" + end + end + end + end + + # https://developers.google.com/search/docs/crawling-indexing/robots/robots_txt#url-matching-based-on-path-values + def test_googles_tests + assert Robots.new(%(allow: /p\ndisallow: /)).matching_rule("https://example.com/page") == {allow: /\/p/} + assert Robots.new(%(allow: /folder\ndisallow: /folder)).matching_rule("https://example.com/folder/page") == {allow: /\/folder/} + assert Robots.new(%(allow: /page\ndisallow: /*.htm)).matching_rule("https://example.com/page.htm") == {disallow: /\/.*\.htm/} + assert Robots.new(%(allow: /page\ndisallow: /*.ph)).matching_rule("https://example.com/page.php5") == {disallow: /\/.*\.ph/} # FAIL + assert Robots.new(%(allow: /$\ndisallow: /)).matching_rule("https://example.com/") == {allow: /\/$/} + assert Robots.new(%(allow: /$\ndisallow: /)).matching_rule("https://example.com/page.htm") == {disallow: /\//} + end + + def test_empty_allow_disallow + assert Robots.new(%(User-agent: *\nAllow:)).rules.dig("*", "allow").empty? + assert Robots.new(%(User-agent: *\nDisallow:)).rules.dig("*", "disallow").empty? + end + + def test_consecutive_user_agents + txt = %(User-agent: Curl + User-agent: Wget + Disallow: /url) + r = Robots.new(txt) + assert r.allowed?("/url") == true + + r.agent = "Curl" + assert r.allowed?("/url") == false + + r.agent = "Wget" + assert r.allowed?("/url") == false + + r.agent = "Other" + assert r.allowed?("/url") == true + end + + def test_unfound_robots + r = Robots.new("") + assert r.allowed?("/url") == true + r.agent = "Curl" + assert r.allowed?("/url") == true + end + + def test_more_other_tests + txt = %(User-agent: rubytest\nDisallow: /no-dir/\nDisallow: /no-page.php\nDisallow: /*-no-dir/\nDisallow: /dir/*.php\nDisallow: *?var\nDisallow: /dir/*?var\n\n# this is a test\nuseragent: *\ndisalow: /test/\n\nsitemap: /sitemapxml.xml\n\n ) + + r = Robots.new(txt, agent: "rubytest") + assert r.allowed?("/dir/page.php") == false + assert r.allowed?("/dir/home.php") == false + assert r.allowed?("/dir/page") == true + assert r.allowed?("/dir/page?var") == false + end +end
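
A quick sketch of what the commit's headline change amounts to: fetching now goes through Net::HTTP and sends the same User-Agent header that the matcher uses. The host and agent name below are illustrative, not taken from the codebase.

```ruby
require "net/http"
require "uri"

# Roughly what Robots#fetch_robots_txt does after this commit: point the given
# URL at /robots.txt and fetch it, passing the User-Agent header only when a
# specific agent was requested. Net::HTTP.get accepts a headers hash as its
# second argument on Ruby 3.0+, which this gem already requires.
agent = "MyAgent"                                                    # illustrative
uri = URI("https://example.com/any/page").tap { |u| u.path = "/robots.txt" }
headers = (agent == "*") ? {} : {"User-Agent" => agent}
body = Net::HTTP.get(uri, headers)                                   # robots.txt body as a String
puts body
```

In the real method any fetch error is rescued and an empty string is returned, so an unreachable robots.txt behaves the same as one that allows everything.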