Update CHANGELOG.md

Whoops - wrong name
Add a platform value for the gemspec
2025-12-28 09:14:53 +00:00 · 2024-12-24 10:55:10 +11:00 · 2024-12-24 10:53:03 +11:00 · 2024-12-24 10:52:12 +11:00 · 2024-12-24 10:44:36 +11:00 · 2024-12-24 10:44:06 +11:00
6 changed files with 87 additions and 20 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,20 @@
 ## [Unreleased]
 ## [0.5.0] - 2024-12-24
 - Fix bug with Disallow rule containing empty line
 ## [0.4.0] - 2024-10-31
 - Ensure VERSION is available
 ## [0.3.0] - 2023-09-18
 - Only return unique sitemaps.
 ## [0.2.0] - 2023-09-10
 - Correctly handle multiple sitemaps + tests.
 ## [0.1.0] - 2023-09-09
 - Initial release
--- a/lib/probot.rb
+++ b/lib/probot.rb
@@ -2,6 +2,7 @@
 require "uri"
 require "net/http"
 require_relative "probot/version"
 # https://moz.com/learn/seo/robotstxt
 # https://stackoverflow.com/questions/45293419/order-of-directives-in-robots-txt-do-they-overwrite-each-other-or-complement-ea
@@ -19,8 +20,8 @@ require "net/http"
 #   Find the most specific rule for a given URL. We use the length of the regexp as a proxy for specificity.
 class Probot
-  attr_reader :rules, :sitemap, :doc
+  attr_reader :rules, :doc
-  attr_accessor :agent
+  attr_accessor :agent, :sitemaps, :site
  def initialize(data, agent: "*")
    raise ArgumentError, "The first argument must be a string" unless data.is_a?(String)
@@ -31,7 +32,8 @@ class Probot
    @current_agents.each { |agent| @rules[agent] ||= {"disallow" => [], "allow" => [], "crawl_delay" => 0} }
    @sitemaps = []
-    @doc = data.start_with?("http") ? fetch_robots_txt(data) : data
+    @site = URI(data) if data.start_with?("http")
    @doc = @site.nil? ? data : fetch_robots_txt(@site)
    parse(@doc)
  end
@@ -90,11 +92,13 @@ class Probot
      end
      # All Regex characters are escaped, then we unescape * and $ as they may be used in robots.txt
      if data.allow? || data.disallow?
-        @current_agents.each { |agent| rules[agent][data.key] << Regexp.new(Regexp.escape(data.value).gsub('\*', ".*").gsub('\$', "$")) }
+        @current_agents.each do |agent|
          rules[agent][data.key] << Regexp.new(Regexp.escape(data.value).gsub('\*', ".*").gsub('\$', "$")) unless data.value.nil?
        end
-        subsequent_agent = false # When user-agent strings are found on consecutive lines, they are considered to be part of the same record. Google ignores crawl_delay.
+        # When user-agent strings are found on consecutive lines, they are considered to be part of the same record. Google ignores crawl_delay.
        subsequent_agent = false
        next
      end
@@ -103,8 +107,12 @@ class Probot
        next
      end
      # Ensure we have an absolute URL
      if data.sitemap?
-        @sitemap = URI(data.value).path
+        sitemap_uri = URI(data.value)
        sitemap_uri = sitemap_uri.host.nil? ? URI.join(*[site, sitemap_uri].compact) : sitemap_uri
        @sitemaps << sitemap_uri.to_s
        @sitemaps.uniq!
        next
      end
@@ -123,6 +131,8 @@ class Probot
    def clean_value = raw_value.split("#").first&.strip
    def clean_url = clean_value&.then { URI(_1).to_s }
    def agent? = key == "user-agent"
    def disallow? = key == "disallow"
@@ -135,11 +145,13 @@ class Probot
    def value
      return clean_value.to_f if crawl_delay?
-      return URI(clean_value).to_s if disallow? || allow?
+      return clean_url if disallow? || allow?
      raw_value
    rescue URI::InvalidURIError
      raw_value
    rescue ArgumentError
      raw_value
    end
  end
--- a/lib/probot/version.rb
+++ b/lib/probot/version.rb
@@ -1,3 +1,3 @@
 class Probot
-  VERSION = "0.1.0"
+  VERSION = "0.5.0"
 end
--- a/probot.gemspec
+++ b/probot.gemspec
@@ -13,6 +13,7 @@ Gem::Specification.new do |spec|
  spec.homepage = "http://github.com/dkam/probot"
  spec.license = "MIT"
  spec.required_ruby_version = ">= 3.0"
  spec.platform = Gem::Platform::RUBY
  spec.metadata["homepage_uri"] = spec.homepage
  spec.metadata["source_code_uri"] = "http://github.com/dkam/probot"
@@ -29,10 +30,5 @@ Gem::Specification.new do |spec|
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]
-
+  spec.add_development_dependency "debug"
  # Uncomment to register a new dependency of your gem
  # spec.add_dependency "example-gem", "~> 1.0"
  # For more information and examples about making a new gem, check out our
  # guide at: https://bundler.io/guides/creating_gem.html
 end
--- a/test/test_helper.rb
+++ b/test/test_helper.rb
@@ -2,6 +2,5 @@
 $LOAD_PATH.unshift File.expand_path("../lib", __dir__)
 require "probot"
 require "probot/version" # for testing the version number - otherwise the gemspec does it.
 require "minitest/autorun"
--- a/test/test_probot.rb
+++ b/test/test_probot.rb
@@ -25,7 +25,7 @@ class TestProbot < Minitest::Test
      Disallow: /noblah/
      Allow: /cart/
      ),
-      sitemap: "/sitemap.xml",
+      sitemaps: ["http://www.allenandunwin.com/sitemap.xml"],
      found_agents: ["*", "FooBot", "BlahBot", "YadaBot"],
      tests: [
        {
@@ -51,7 +51,7 @@ class TestProbot < Minitest::Test
      Allow: /    # comment
      Sitemap: http://example.com/sitemap.xml
      ),
-      sitemap: "/sitemap.xml",
+      sitemaps: ["http://example.com/sitemap.xml"],
      found_agents: ["*"],
      tests: [
        {
@@ -85,7 +85,7 @@ class TestProbot < Minitest::Test
      sitemap: /sitemapxml.xml
      ),
-      sitemap: "/sitemapxml.xml",
+      sitemaps: ["/sitemapxml.xml"],
      found_agents: ["*", "rubytest"],
      tests: [
        {
@@ -95,6 +95,19 @@ class TestProbot < Minitest::Test
          crawl_delay: 0
        }
      ]
    },
    {
      txt: %("User-agent: *\nDisallow: /wp/wp-admin/\nAllow: /wp/wp-admin/admin-ajax.php\n\nUser-agent: *\nDisallow: /wp-content/uploads/wpo/wpo-plugins-tables-list.json\n\n# START YOAST BLOCK\n# ---------------------------\nUser-agent: *\nDisallow:\n\nSitemap: https://prhinternationalsales.com/sitemap_index.xml\n# ---------------------------\n# END YOAST BLOCK"),
      sitemaps: ["https://prhinternationalsales.com/sitemap_index.xml"],
      found_agents: ["*"],
      tests: [
        {
          agent: "*",
          allowed: ["/wp/wp-admin/admin-ajax.php"],
          disallowed: ["/wp/wp-admin/", "/wp-content/uploads/wpo/wpo-plugins-tables-list.json"],
          crawl_delay: 0
        }
      ]
    }
  ].freeze
@@ -103,7 +116,7 @@ class TestProbot < Minitest::Test
      r = Probot.new(test_case[:txt])
      assert_equal test_case[:found_agents], r.found_agents, "found_agents for test #{ind}"
-      assert_equal test_case[:sitemap], r.sitemap, "sitemap for test #{ind}"
+      assert_equal test_case[:sitemaps], r.sitemaps, "sitemap for test #{ind}"
      test_case[:tests].each do |tst|
        r = Probot.new(test_case[:txt], agent: tst[:agent])
@@ -131,7 +144,9 @@ class TestProbot < Minitest::Test
  def test_empty_allow_disallow
    assert Probot.new(%(User-agent: *\nAllow:)).rules.dig("*", "allow").empty?
    assert Probot.new(%(User-agent: *\nAllow:\n\n)).rules.dig("*", "allow").empty?
    assert Probot.new(%(User-agent: *\nDisallow:)).rules.dig("*", "disallow").empty?
    assert Probot.new(%(User-agent: *\nDisallow:\n\n)).rules.dig("*", "disallow").empty?
  end
  def test_consecutive_user_agents
@@ -167,4 +182,34 @@ class TestProbot < Minitest::Test
    assert r.allowed?("/dir/page") == true
    assert r.allowed?("/dir/page?var") == false
  end
  def test_multiple_sitemaps
    txt = %(User-agent: *\nSitemap: https://example.com/sitemapxml.xml\nSitemap: https://example.com/sitemapxml2.xml\n\n)
    r = Probot.new(txt)
    assert_equal 2, r.sitemaps.length
    assert r.sitemaps.include?("https://example.com/sitemapxml.xml")
    assert r.sitemaps.include?("https://example.com/sitemapxml2.xml")
  end
  # Sitemaps should be absolute URLs, but we'll accept relative URLs and make them absolute.
  # However, we need to test both scenarios - when we know the site, and when we don't because we're parsing a robots.txt file.
  # This test is a little gross, reaching into the guts of the class, but it's the easiest way to test this.
  def test_absolute_sitemaps
    txt = %(User-agent: *\nSitemap: /sitemapxml.xml\nSitemap: /sitemapxml2.xml\n\n)
    r = Probot.new(txt)
    assert_equal 2, r.sitemaps.length
    assert r.sitemaps.include?("/sitemapxml.xml"), "expected /sitemapxml.xml, got #{r.sitemaps}"
    assert r.sitemaps.include?("/sitemapxml2.xml"), "expected /sitemapxml2.xml, got #{r.sitemaps}"
    # We have to manually set the site, as we're not parsing a URL - then we need to reset the sitemaps array and reparse the doc. Gross.
    r = Probot.new(txt)
    r.site = URI("https://example.com")
    r.sitemaps = []
    r.parse(r.doc)
    assert_equal 2, r.sitemaps.length
    assert r.sitemaps.include?("https://example.com/sitemapxml.xml"), "expected https://example.com/sitemapxml.xml, got #{r.sitemaps}"
    assert r.sitemaps.include?("https://example.com/sitemapxml2.xml"), "expected https://example.com/sitemapxml2.xml, got #{r.sitemaps}"
  end
 end
Author	SHA1	Message	Date
Dan Milne	45c1b001cd	Update CHANGELOG.md	2024-12-24 10:55:10 +11:00
Dan Milne	f1a0b74a97	Whoops - wrong name	2024-12-24 10:53:03 +11:00
Dan Milne	2e91518fd6	Add a platform value for the gemspec	2024-12-24 10:52:12 +11:00
Dan Milne	c4e1b876ce	Version bump	2024-12-24 10:44:36 +11:00
Dan Milne	a7291bdfc3	add test to ensure allowed clause with value also works	2024-12-24 10:44:06 +11:00
Dan Milne	88c7dc67f2	Add code and tests to handle a disallowed clause with value	2024-12-24 10:43:17 +11:00
Dan Milne	36b6a29039	Update change log	2024-10-31 08:16:11 +11:00
Dan Milne	89432b2dac	Fix up the version constant	2024-10-31 08:06:09 +11:00
Dan Milne	ad48a4e335	Bump version	2023-09-18 11:05:45 +10:00
Dan Milne	fea1e2009a	Sitemaps should be unique	2023-09-18 11:05:21 +10:00
Dan Milne	c700c09021	Bump the version	2023-09-10 13:18:11 +10:00
Dan Milne	71bbd4d1ad	Support for multiple sitemaps. Sitemaps should be absolute, so we'll try and make them absolute - but if we're passed the Robots.txt as text we can't determine the host.	2023-09-10 13:17:53 +10:00