# frozen_string_literal: true

require 'json'
require 'net/http'
require 'openssl'
require 'uri'

# BotNetworkRangeImporter - Service for importing official bot network ranges
#
# Imports network ranges from official bot provider sources such as:
# - Amazon AWS: https://ip-ranges.amazonaws.com/ip-ranges.json
# - Google: official crawler IP lists
# - Microsoft/Bing: bot network ranges
# - Anthropic: service network ranges
# - OpenAI: service network ranges
class BotNetworkRangeImporter
  class ImportError < StandardError; end
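
  # Usage sketch (assumes a Rails app with a NetworkRange model; the option
  # values below are illustrative):
  #
  #   BotNetworkRangeImporter.import_from_source(:amazon_aws, batch_size: 500)
  #   BotNetworkRangeImporter.import_all_sources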

  # Official sources for bot network ranges
  BOT_SOURCES = {
    amazon_aws: {
      name: 'Amazon AWS',
      url: 'https://ip-ranges.amazonaws.com/ip-ranges.json',
      format: :json,
      parser: :parse_aws_ranges,
      description: 'Official AWS IP ranges including Amazonbot and other services'
    },
    google: {
      name: 'Google',
      # Note: these URLs may need to be updated based on current Google documentation
      urls: [
        'https://developers.google.com/search/docs/files/googlebot.json',
        'https://developers.google.com/search/docs/files/special-crawlers.json'
      ],
      format: :json,
      parser: :parse_google_ranges,
      description: 'Googlebot and other Google crawler IP ranges'
    },
    microsoft_bing: {
      name: 'Microsoft Bing',
      # Note: Microsoft may require web scraping or API access
      url: 'https://www.bing.com/toolbox/bingbot.json',
      format: :json,
      parser: :parse_microsoft_ranges,
      description: 'Bingbot and other Microsoft crawler IP ranges'
    },
    anthropic: {
      name: 'Anthropic Claude',
      # Note: Anthropic ranges may need manual updates or a different approach
      url: 'https://docs.anthropic.com/claude/reference/ip_ranges',
      format: :html,
      parser: :parse_anthropic_ranges,
      description: 'Anthropic Claude API service IP ranges'
    },
    openai_searchbot: {
      name: 'OpenAI SearchBot',
      url: 'https://openai.com/searchbot.json',
      format: :json,
      parser: :parse_openai_ranges,
      description: 'OpenAI SearchBot for ChatGPT search features'
    },
    openai_chatgpt_user: {
      name: 'OpenAI ChatGPT-User',
      url: 'https://openai.com/chatgpt-user.json',
      format: :json,
      parser: :parse_openai_ranges,
      description: 'OpenAI ChatGPT-User for user actions in ChatGPT and Custom GPTs'
    },
    openai_gptbot: {
      name: 'OpenAI GPTBot',
      url: 'https://openai.com/gptbot.json',
      format: :json,
      parser: :parse_openai_ranges,
      description: 'OpenAI GPTBot for training AI foundation models'
    },
    cloudflare: {
      name: 'Cloudflare',
      urls: [
        'https://www.cloudflare.com/ips-v4',
        'https://www.cloudflare.com/ips-v6'
      ],
      format: :text,
      parser: :parse_cloudflare_ranges,
      description: 'Cloudflare network ranges including their crawlers and services'
    },
    facebook: {
      name: 'Facebook/Meta',
      url: 'https://developers.facebook.com/docs/sharing/webmasters/crawler/',
      format: :html,
      parser: :parse_facebook_ranges,
      description: 'Facebook/Meta crawlers and bots'
    },
    applebot: {
      name: 'Applebot',
      url: 'https://support.apple.com/en-us/HT204683',
      format: :html,
      parser: :parse_applebot_ranges,
      description: 'Applebot crawler for Apple search and Siri'
    },
    duckduckgo: {
      name: 'DuckDuckBot',
      url: 'https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot/',
      format: :html,
      parser: :parse_duckduckgo_ranges,
      description: 'DuckDuckGo search crawler'
    }
  }.freeze
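
  # Adding a provider is a matter of extending the registry and defining a
  # matching parser; a hypothetical entry (URL and parser are not real) might
  # look like:
  #
  #   commoncrawl: {
  #     name: 'Common Crawl',
  #     url: 'https://example.org/ccbot.json',   # hypothetical URL
  #     format: :json,
  #     parser: :parse_commoncrawl_ranges,       # would need to be implemented
  #     description: 'CCBot crawler ranges'
  #   }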

  def self.import_from_source(source_key, options = {})
    source = BOT_SOURCES[source_key.to_sym]
    raise ImportError, "Unknown source: #{source_key}" unless source

    parser = source[:parser]
    raise ImportError, "Unknown parser: #{parser}" unless respond_to?(parser, true)

    puts "Importing bot network ranges from #{source[:name]}..."
    send(parser, source, options)
  end

  def self.import_all_sources(options = {})
    results = {}

    BOT_SOURCES.each do |source_key, source|
      puts "\n" + ('=' * 50)
      puts "Processing #{source[:name]}..."
      puts '=' * 50

      begin
        results[source_key] = import_from_source(source_key, options)
      rescue => e
        Rails.logger.error "Failed to import from #{source[:name]}: #{e.message}"
        results[source_key] = { error: e.message, imported: 0 }
      end
    end

    puts "\n" + ('=' * 50)
    puts 'Import Summary'
    puts '=' * 50

    results.each do |source, result|
      if result[:error]
        puts "#{source}: FAILED - #{result[:error]}"
      else
        puts "#{source}: SUCCESS - #{result[:imported]} ranges imported"
      end
    end

    results
  end
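
  # A hypothetical rake task wiring (the task name and file path are
  # assumptions, not part of this service):
  #
  #   # lib/tasks/bot_ranges.rake
  #   namespace :bot_ranges do
  #     desc 'Import bot network ranges from all official sources'
  #     task import: :environment do
  #       BotNetworkRangeImporter.import_all_sources
  #     end
  #   end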

  # NOTE: a bare `private` has no effect on methods defined with `def self.`,
  # so the helpers below are hidden via `private_class_method` at the bottom
  # of the class instead.

  # Amazon AWS IP ranges parser
  def self.parse_aws_ranges(source, options = {})
    uri = URI.parse(source[:url])
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = true
    http.read_timeout = 30

    response = http.get(uri.request_uri)
    raise ImportError, "Failed to fetch AWS IP ranges: #{response.code}" unless response.code == '200'

    data = JSON.parse(response.body)
    imported_count = 0
    batch_size = options[:batch_size] || 1000
    batch = []

    # Filter for relevant services (customizable via options[:aws_services])
    relevant_services = options[:aws_services] || ['AMAZON', 'ROUTE53', 'EC2', 'CLOUDFRONT']

    data['prefixes'].each do |prefix|
      # Keep only prefixes for the relevant services
      next unless relevant_services.include?(prefix['service'])

      network_range = {
        network: prefix['ip_prefix'],
        source: 'bot_import_amazon_aws',
        asn: nil, # AWS doesn't provide ASNs in this feed
        asn_org: 'Amazon Web Services',
        company: 'Amazon',
        country: nil,
        is_datacenter: true,
        is_proxy: false,
        is_vpn: false,
        additional_data: {
          aws_service: prefix['service'],
          aws_region: prefix['region'],
          aws_network_border_group: prefix['network_border_group'],
          import_date: Time.current.iso8601
        }.to_json
      }

      batch << network_range

      if batch.size >= batch_size
        imported_count += import_batch(batch, 'Amazon AWS')
        batch = []
        puts "Imported #{imported_count} AWS ranges..."
      end
    end

    # Import remaining records
    imported_count += import_batch(batch, 'Amazon AWS') if batch.any?

    puts "Amazon AWS import completed: #{imported_count} ranges imported"
    { imported: imported_count, source: 'Amazon AWS' }
  rescue Timeout::Error, Net::OpenTimeout, Net::ReadTimeout => e
    raise ImportError, "Network timeout while fetching AWS ranges: #{e.message}"
  rescue JSON::ParserError => e
    raise ImportError, "Failed to parse AWS JSON response: #{e.message}"
  end
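
  # Example (illustrative values): restrict the AWS import to CloudFront and
  # use smaller batches:
  #
  #   BotNetworkRangeImporter.import_from_source(
  #     :amazon_aws,
  #     aws_services: ['CLOUDFRONT'],
  #     batch_size: 250
  #   )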

  # Google crawler IP ranges parser
  def self.parse_google_ranges(source, options = {})
    imported_count = 0
    fetched_any = false
    batch_size = options[:batch_size] || 1000

    # Import from every listed URL; the published files complement each other
    # (e.g. googlebot.json and special-crawlers.json).
    urls = Array(source[:urls] || source[:url])

    urls.each do |url|
      begin
        puts "Attempting to fetch Google ranges from: #{url}"

        uri = URI.parse(url)
        http = Net::HTTP.new(uri.host, uri.port)
        http.use_ssl = true
        http.read_timeout = 30

        response = http.get(uri.request_uri)
        next unless response.code == '200'

        data = JSON.parse(response.body)
        fetched_any = true
        batch = []

        # The published Google files wrap entries in a "prefixes" array with
        # ipv4Prefix/ipv6Prefix keys; cidr/prefix keys are kept as a fallback
        entries = data.is_a?(Array) ? data : Array(data['prefixes'])

        entries.each do |entry|
          network = entry['ipv4Prefix'] || entry['ipv6Prefix'] || entry['cidr'] || entry['prefix']
          next unless network

          network_range = {
            network: network,
            source: 'bot_import_google',
            asn: nil,
            asn_org: 'Google LLC',
            company: 'Google',
            country: nil,
            is_datacenter: true,
            is_proxy: false,
            is_vpn: false,
            additional_data: {
              crawler_type: entry['crawler_type'] || 'unknown',
              user_agent: entry['user_agent'],
              import_date: Time.current.iso8601
            }.to_json
          }

          batch << network_range

          if batch.size >= batch_size
            imported_count += import_batch(batch, 'Google')
            batch = []
            puts "Imported #{imported_count} Google ranges..."
          end
        end

        # Import remaining records from this URL
        imported_count += import_batch(batch, 'Google') if batch.any?
      rescue => e
        Rails.logger.warn "Failed to fetch from #{url}: #{e.message}"
        next
      end
    end

    raise ImportError, 'Failed to fetch Google crawler ranges from any URL' unless fetched_any

    puts "Google import completed: #{imported_count} ranges imported"
    { imported: imported_count, source: 'Google' }
  end
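
  # The published Google files look roughly like this (illustrative excerpt):
  #
  #   { "creationTime": "...",
  #     "prefixes": [ { "ipv6Prefix": "2001:4860:4801:10::/64" },
  #                   { "ipv4Prefix": "66.249.64.0/27" } ] }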

  # Microsoft Bing crawler IP ranges parser
  def self.parse_microsoft_ranges(source, options = {})
    # Microsoft requires special handling as it does not publish a direct
    # JSON feed; this is a placeholder implementation
    puts 'Microsoft Bing crawler import requires manual configuration or web scraping'
    puts 'Refer to: https://www.bing.com/webmaster/help/which-crawlers-does-bing-use'

    {
      imported: 0,
      source: 'Microsoft Bing',
      note: 'Manual configuration required - Microsoft does not provide direct IP range feeds'
    }
  end

  # Anthropic service IP ranges parser
  def self.parse_anthropic_ranges(source, options = {})
    # Anthropic ranges may need to be configured by hand; this is a
    # placeholder implementation (see the import_manual_ranges sketch below)
    puts 'Anthropic Claude service ranges require manual configuration'
    puts 'Refer to: https://docs.anthropic.com/claude/reference/ip_ranges'

    {
      imported: 0,
      source: 'Anthropic',
      note: 'Manual configuration required - Anthropic does not provide automated IP range feeds'
    }
  end
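
  # A minimal sketch of what "manual configuration" could look like for the
  # placeholder sources above: a hypothetical helper (not part of the original
  # importer) that pushes a hand-maintained CIDR list through the same
  # import_batch pipeline. Real CIDR values would come from the provider docs.
  def self.import_manual_ranges(cidrs, company:, source_label:)
    ranges = cidrs.map do |cidr|
      {
        network: cidr,
        source: "bot_import_#{source_label}",
        asn: nil,
        asn_org: company,
        company: company,
        country: nil,
        is_datacenter: true,
        is_proxy: false,
        is_vpn: false,
        additional_data: { import_date: Time.current.iso8601 }.to_json
      }
    end

    { imported: import_batch(ranges, company), source: company }
  end

  # Example (hypothetical values, using a documentation range):
  #
  #   BotNetworkRangeImporter.import_manual_ranges(
  #     ['192.0.2.0/24'], company: 'Anthropic', source_label: 'anthropic'
  #   )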

  # OpenAI crawler IP ranges parser
  def self.parse_openai_ranges(source, options = {})
    uri = URI.parse(source[:url])
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = true
    http.read_timeout = 30

    response = http.get(uri.request_uri)
    raise ImportError, "Failed to fetch OpenAI IP ranges: #{response.code}" unless response.code == '200'

    data = JSON.parse(response.body)
    imported_count = 0
    batch_size = options[:batch_size] || 1000
    batch = []

    # Determine the crawler type from the source name, e.g. 'searchbot'
    crawler_type = source[:name].gsub('OpenAI ', '').downcase

    # Handle both wrapped ("prefixes") and bare-array OpenAI JSON formats
    prefixes = data['prefixes'] || data

    prefixes.each do |entry|
      # OpenAI has published ranges as ipv4Prefix/ipv6Prefix or cidr/ip_prefix
      ip_range = entry['ipv4Prefix'] || entry['ipv6Prefix'] || entry['cidr'] || entry['ip_prefix'] || entry['ip']
      next unless ip_range

      # Convert bare IPs to host networks: /32 for IPv4, /128 for IPv6
      network = if ip_range.include?('/')
                  ip_range
                else
                  ip_range.include?(':') ? "#{ip_range}/128" : "#{ip_range}/32"
                end

      network_range = {
        network: network,
        source: "bot_import_openai_#{crawler_type}",
        asn: nil,
        asn_org: 'OpenAI',
        company: 'OpenAI',
        country: nil,
        is_datacenter: true,
        is_proxy: false,
        is_vpn: false,
        additional_data: {
          crawler_type: crawler_type,
          crawler_purpose: crawler_purpose(crawler_type),
          user_agent: openai_user_agent(crawler_type),
          import_date: Time.current.iso8601,
          source_url: source[:url]
        }.to_json
      }

      batch << network_range

      if batch.size >= batch_size
        imported_count += import_batch(batch, "OpenAI #{crawler_type}")
        batch = []
        puts "Imported #{imported_count} OpenAI #{crawler_type} ranges..."
      end
    end

    # Import remaining records
    imported_count += import_batch(batch, "OpenAI #{crawler_type}") if batch.any?

    puts "OpenAI #{crawler_type} import completed: #{imported_count} ranges imported"
    { imported: imported_count, source: "OpenAI #{crawler_type}" }
  rescue Timeout::Error, Net::OpenTimeout, Net::ReadTimeout => e
    raise ImportError, "Network timeout while fetching OpenAI ranges: #{e.message}"
  rescue JSON::ParserError => e
    raise ImportError, "Failed to parse OpenAI JSON response: #{e.message}"
  end
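
  # Example: importing all three OpenAI feeds in one pass (illustrative):
  #
  #   %i[openai_searchbot openai_chatgpt_user openai_gptbot].each do |key|
  #     BotNetworkRangeImporter.import_from_source(key)
  #   end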

  def self.import_batch(batch_data, source_name)
    # Check for existing ranges to avoid duplicates
    existing_networks = NetworkRange.where(network: batch_data.map { |d| d[:network] }).pluck(:network)
    new_ranges = batch_data.reject { |d| existing_networks.include?(d[:network]) }

    if new_ranges.any?
      NetworkRange.insert_all(new_ranges)
      puts "Imported #{new_ranges.size} new #{source_name} ranges (#{batch_data.size - new_ranges.size} duplicates skipped)"
    else
      puts "No new #{source_name} ranges to import (all duplicates)"
    end

    new_ranges.size
  rescue => e
    Rails.logger.error "Failed to import #{source_name} batch: #{e.message}"

    # Fall back to individual inserts; new_ranges is nil when the failure
    # happened before the duplicate check, so retry the whole batch then
    imported = 0
    (new_ranges || batch_data).each do |data|
      begin
        NetworkRange.create!(data)
        imported += 1
      rescue => individual_error
        Rails.logger.error "Failed to import individual #{source_name} record: #{individual_error.message}"
      end
    end

    imported
  end
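
  # Note: the duplicate check in import_batch is not race-safe. With a unique
  # index on the network column, a hypothetical alternative is to let the
  # database skip duplicates in one statement:
  #
  #   NetworkRange.insert_all(batch_data, unique_by: :network)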

  # Helper method to describe a crawler's purpose based on its type
  def self.crawler_purpose(crawler_type)
    case crawler_type
    when 'searchbot'
      "Used to link to and surface websites in ChatGPT's search features"
    when 'chatgpt-user'
      'User actions in ChatGPT and Custom GPTs, including GPT Actions'
    when 'gptbot'
      "Used to crawl content for training OpenAI's generative AI foundation models"
    else
      'Unknown purpose'
    end
  end

  # Helper method to get OpenAI user agent strings
  def self.openai_user_agent(crawler_type)
    case crawler_type
    when 'searchbot'
      'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; OAI-SearchBot/1.0; +https://openai.com/searchbot'
    when 'chatgpt-user'
      'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; ChatGPT-User/1.0; +https://openai.com/bot'
    when 'gptbot'
      'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; GPTBot/1.1; +https://openai.com/gptbot'
    else
      'Unknown user agent'
    end
  end

  # Cloudflare IP ranges parser
  def self.parse_cloudflare_ranges(source, options = {})
    imported_count = 0
    urls = Array(source[:urls])
    batch_size = options[:batch_size] || 1000
    batch = []

    urls.each do |url|
      begin
        puts "Fetching Cloudflare ranges from: #{url}"

        uri = URI.parse(url)
        http = Net::HTTP.new(uri.host, uri.port)
        http.use_ssl = true
        http.read_timeout = 30

        response = http.get(uri.request_uri)
        unless response.code == '200'
          Rails.logger.warn "Failed to fetch Cloudflare ranges from #{url}: HTTP #{response.code}"
          next
        end

        # Cloudflare publishes plain-text CIDR lists; handle both
        # newline-separated and single-line formats
        lines = response.body.include?("\n") ? response.body.split("\n") : response.body.split
        ip_version = url.include?('ips-v4') ? 4 : 6

        lines.each do |line|
          line = line.strip
          next if line.empty? || line.start_with?('#')

          # Cheap syntactic CIDR check
          next unless line.match?(%r{\A[0-9a-fA-F:.]+/\d+\z})

          network_range = {
            network: line,
            source: 'bot_import_cloudflare',
            asn: nil,
            asn_org: 'Cloudflare',
            company: 'Cloudflare',
            country: nil,
            is_datacenter: true,
            is_proxy: false,
            is_vpn: false,
            additional_data: {
              ip_version: ip_version,
              import_date: Time.current.iso8601,
              source_url: url,
              service_type: 'cdn_and_security'
            }.to_json
          }

          batch << network_range

          if batch.size >= batch_size
            imported_count += import_batch(batch, 'Cloudflare')
            batch = []
            puts "Imported #{imported_count} Cloudflare ranges..."
          end
        end
      rescue => e
        Rails.logger.warn "Failed to fetch Cloudflare ranges from #{url}: #{e.message}"
        next
      end
    end

    # Import remaining records
    imported_count += import_batch(batch, 'Cloudflare') if batch.any?

    puts "Cloudflare import completed: #{imported_count} ranges imported"
    { imported: imported_count, source: 'Cloudflare' }
  end
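
  # The regexp above is only a syntactic check; a stricter, hypothetical
  # alternative is to let IPAddr validate each candidate line:
  #
  #   require 'ipaddr'
  #
  #   begin
  #     IPAddr.new(line)
  #   rescue IPAddr::InvalidAddressError
  #     next
  #   end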

  # Facebook/Meta crawler ranges parser (placeholder)
  def self.parse_facebook_ranges(source, options = {})
    puts 'Facebook/Meta crawler ranges require web scraping or manual configuration'
    puts 'Refer to: https://developers.facebook.com/docs/sharing/webmasters/crawler/'

    {
      imported: 0,
      source: 'Facebook/Meta',
      note: 'Manual configuration required - Facebook does not provide automated IP range feeds'
    }
  end

  # Applebot crawler ranges parser (placeholder)
  def self.parse_applebot_ranges(source, options = {})
    puts 'Applebot ranges require web scraping or manual configuration'
    puts 'Refer to: https://support.apple.com/en-us/HT204683'

    {
      imported: 0,
      source: 'Applebot',
      note: 'Manual configuration required - Apple does not provide automated IP range feeds'
    }
  end

  # DuckDuckBot crawler ranges parser (placeholder)
  def self.parse_duckduckgo_ranges(source, options = {})
    puts 'DuckDuckBot ranges require web scraping or manual configuration'
    puts 'Refer to: https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot/'

    {
      imported: 0,
      source: 'DuckDuckBot',
      note: 'Manual configuration required - DuckDuckGo does not provide automated IP range feeds'
    }
  end
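
  # For providers without a machine-readable feed, a common alternative to
  # range imports is reverse-DNS verification of individual client IPs; a
  # minimal sketch (the hostname suffixes are illustrative):
  #
  #   require 'resolv'
  #
  #   def self.verified_bot_ip?(ip, suffixes)
  #     host = Resolv.getname(ip)
  #     suffixes.any? { |s| host.end_with?(s) } && Resolv.getaddress(host) == ip
  #   rescue Resolv::ResolvError
  #     false
  #   end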

  private_class_method :parse_aws_ranges, :parse_google_ranges,
                       :parse_microsoft_ranges, :parse_anthropic_ranges,
                       :parse_openai_ranges, :parse_cloudflare_ranges,
                       :parse_facebook_ranges, :parse_applebot_ranges,
                       :parse_duckduckgo_ranges, :import_batch,
                       :crawler_purpose, :openai_user_agent
end