Compare commits


8 Commits

Author     SHA1        Message                                                       Date
Dan Milne  45c1b001cd  Update CHANGELOG.md                                           2024-12-24 10:55:10 +11:00
Dan Milne  f1a0b74a97  Whoops - wrong name                                           2024-12-24 10:53:03 +11:00
Dan Milne  2e91518fd6  Add a platform value for the gemspec                          2024-12-24 10:52:12 +11:00
Dan Milne  c4e1b876ce  Version bump                                                  2024-12-24 10:44:36 +11:00
Dan Milne  a7291bdfc3  add test to ensure allowed clause with value also works      2024-12-24 10:44:06 +11:00
Dan Milne  88c7dc67f2  Add code and tests to handle a disallowed clause with value  2024-12-24 10:43:17 +11:00
Dan Milne  36b6a29039  Update change log                                             2024-10-31 08:16:11 +11:00
Dan Milne  89432b2dac  Fix up the version constant                                   2024-10-31 08:06:09 +11:00
6 changed files with 36 additions and 4 deletions

CHANGELOG.md

@@ -1,5 +1,13 @@
 ## [Unreleased]
 
+## [0.5.0] - 2024-12-24
+
+- Fix bug with Disallow rule containing empty line
+
+## [0.4.0] - 2024-10-31
+
+- Ensure VERSION is available
+
 ## [0.3.0] - 2023-09-18
 
 - Only return unique sitemaps.

lib/probot.rb

@@ -2,6 +2,7 @@
 require "uri"
 require "net/http"
+require_relative "probot/version"
 
 # https://moz.com/learn/seo/robotstxt
 # https://stackoverflow.com/questions/45293419/order-of-directives-in-robots-txt-do-they-overwrite-each-other-or-complement-ea
@@ -30,6 +31,7 @@ class Probot
     @current_agents = ["*"]
     @current_agents.each { |agent| @rules[agent] ||= {"disallow" => [], "allow" => [], "crawl_delay" => 0} }
     @sitemaps = []
+
     @site = URI(data) if data.start_with?("http")
     @doc = @site.nil? ? data : fetch_robots_txt(@site)
     parse(@doc)
@@ -91,7 +93,9 @@ class Probot
     # All Regex characters are escaped, then we unescape * and $ as they may be used in robots.txt
     if data.allow? || data.disallow?
-      @current_agents.each { |agent| rules[agent][data.key] << Regexp.new(Regexp.escape(data.value).gsub('\*', ".*").gsub('\$', "$")) }
+      @current_agents.each do |agent|
+        rules[agent][data.key] << Regexp.new(Regexp.escape(data.value).gsub('\*', ".*").gsub('\$', "$")) unless data.value.nil?
+      end
 
     # When user-agent strings are found on consecutive lines, they are considered to be part of the same record. Google ignores crawl_delay.
     subsequent_agent = false
@@ -127,6 +131,8 @@ class Probot
     def clean_value = raw_value.split("#").first&.strip
+
+    def clean_url = clean_value&.then { URI(_1).to_s }
 
     def agent? = key == "user-agent"
     def disallow? = key == "disallow"
@@ -139,11 +145,13 @@ class Probot
     def value
       return clean_value.to_f if crawl_delay?
-      return URI(clean_value).to_s if disallow? || allow?
+      return clean_url if disallow? || allow?
 
       raw_value
     rescue URI::InvalidURIError
       raw_value
+    rescue ArgumentError
+      raw_value
     end
   end
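
Taken together, these hunks fix the empty-Disallow bug: for a bare "Disallow:" line, raw_value is empty, clean_value comes back nil, clean_url propagates that nil via &.then instead of calling URI(nil) (which raises ArgumentError, now also rescued), and the "unless data.value.nil?" guard then stores no rule. A standalone sketch of the pattern-to-regex conversion, using a hypothetical to_regex helper rather than the gem's API:

  # Escape every regex metacharacter, then restore the only two wildcards
  # robots.txt supports: "*" (any characters) and "$" (end-of-URL anchor).
  def to_regex(pattern)
    return nil if pattern.nil?  # a bare "Disallow:" yields no pattern at all
    Regexp.new(Regexp.escape(pattern).gsub('\*', ".*").gsub('\$', "$"))
  end

  to_regex("/wp/wp-admin/*")  # matches "/wp/wp-admin/" followed by anything
  to_regex("/*.pdf$")         # matches any path ending in ".pdf"
  to_regex(nil)               # => nil, so the rules hash is left untouched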

lib/probot/version.rb

@@ -1,3 +1,3 @@
 class Probot
-  VERSION = "0.3.0"
+  VERSION = "0.5.0"
 end
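
With the require_relative added to lib/probot.rb above, the constant is available as soon as the gem is loaded, which is what the 0.4.0 changelog entry promises:

  require "probot"
  Probot::VERSION  # => "0.5.0"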

probot.gemspec

@@ -13,6 +13,7 @@ Gem::Specification.new do |spec|
   spec.homepage = "http://github.com/dkam/probot"
   spec.license = "MIT"
   spec.required_ruby_version = ">= 3.0"
+  spec.platform = Gem::Platform::RUBY
 
   spec.metadata["homepage_uri"] = spec.homepage
   spec.metadata["source_code_uri"] = "http://github.com/dkam/probot"
@@ -29,4 +30,5 @@ Gem::Specification.new do |spec|
   spec.bindir = "exe"
   spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
   spec.require_paths = ["lib"]
+  spec.add_development_dependency "debug"
 end
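
Gem::Platform::RUBY is the RubyGems default, so the new platform line makes explicit what the gem already was: pure Ruby, with no precompiled native extension. The "debug" development dependency is installed only for local development, not for consumers of the gem.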

test/test_helper.rb

@@ -2,6 +2,5 @@
 $LOAD_PATH.unshift File.expand_path("../lib", __dir__)
 
 require "probot"
-require "probot/version" # for testing the version number - otherwise the gemspec does it.
 
 require "minitest/autorun"

test/test_probot.rb

@@ -95,6 +95,19 @@ class TestProbot < Minitest::Test
         crawl_delay: 0
       }
     ]
+  },
+  {
+    txt: %("User-agent: *\nDisallow: /wp/wp-admin/\nAllow: /wp/wp-admin/admin-ajax.php\n\nUser-agent: *\nDisallow: /wp-content/uploads/wpo/wpo-plugins-tables-list.json\n\n# START YOAST BLOCK\n# ---------------------------\nUser-agent: *\nDisallow:\n\nSitemap: https://prhinternationalsales.com/sitemap_index.xml\n# ---------------------------\n# END YOAST BLOCK"),
+    sitemaps: ["https://prhinternationalsales.com/sitemap_index.xml"],
+    found_agents: ["*"],
+    tests: [
+      {
+        agent: "*",
+        allowed: ["/wp/wp-admin/admin-ajax.php"],
+        disallowed: ["/wp/wp-admin/", "/wp-content/uploads/wpo/wpo-plugins-tables-list.json"],
+        crawl_delay: 0
+      }
+    ]
   }
 ].freeze
@@ -131,7 +144,9 @@ class TestProbot < Minitest::Test
   def test_empty_allow_disallow
     assert Probot.new(%(User-agent: *\nAllow:)).rules.dig("*", "allow").empty?
+    assert Probot.new(%(User-agent: *\nAllow:\n\n)).rules.dig("*", "allow").empty?
     assert Probot.new(%(User-agent: *\nDisallow:)).rules.dig("*", "disallow").empty?
+    assert Probot.new(%(User-agent: *\nDisallow:\n\n)).rules.dig("*", "disallow").empty?
   end
 
   def test_consecutive_user_agents
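
The new fixture runs three consecutive "User-agent: *" records through the parser, the last one (a Yoast SEO block) carrying an empty Disallow. All three records accumulate into the single rule set for "*", and the empty clause contributes nothing. A quick check using the rules reader the tests above rely on (expected values follow from those expectations):

  robots = Probot.new(%(User-agent: *\nDisallow: /wp/wp-admin/\n\nUser-agent: *\nDisallow:\n))
  robots.rules.dig("*", "disallow").length  # => 1, the empty Disallow added no rule
  robots.rules.dig("*", "allow")            # => [], no Allow lines in this sample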