# frozen_string_literal: true

# BotNetworkRangeImporter - Service for importing official bot network ranges
# into the NetworkRange table.
#
# Imports network ranges from official bot provider sources such as:
# - Amazon AWS: https://ip-ranges.amazonaws.com/ip-ranges.json
# - Google: Official crawler IP lists
# - Microsoft/Bing: Bot network ranges
# - Anthropic: Service network ranges
# - OpenAI: Service network ranges
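#
# Usage (illustrative; run from a Rails console or rake task):
#   BotNetworkRangeImporter.import_from_source(:amazon_aws, batch_size: 500)
#   BotNetworkRangeImporter.import_all_sources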
class BotNetworkRangeImporter
  class ImportError < StandardError; end

  # Official sources for bot network ranges
  BOT_SOURCES = {
    amazon_aws: {
      name: 'Amazon AWS',
      url: 'https://ip-ranges.amazonaws.com/ip-ranges.json',
      format: :json,
      parser: :parse_aws_ranges,
      description: 'Official AWS IP ranges including Amazonbot and other services'
    },
    google: {
      name: 'Google',
      # Note: these URLs may need to be updated based on current Google documentation
      urls: [
        'https://developers.google.com/search/docs/files/googlebot.json',
        'https://developers.google.com/search/docs/files/special-crawlers.json'
      ],
      format: :json,
      parser: :parse_google_ranges,
      description: 'Googlebot and other Google crawler IP ranges'
    },
    microsoft_bing: {
      name: 'Microsoft Bing',
      # Note: Microsoft may require web scraping or API access
      url: 'https://www.bing.com/toolbox/bingbot.json',
      format: :json,
      parser: :parse_microsoft_ranges,
      description: 'Bingbot and other Microsoft crawler IP ranges'
    },
    anthropic: {
      name: 'Anthropic Claude',
      # Note: Anthropic ranges may need manual updates or a different approach
      url: 'https://docs.anthropic.com/claude/reference/ip_ranges',
      format: :html,
      parser: :parse_anthropic_ranges,
      description: 'Anthropic Claude API service IP ranges'
    },
    openai_searchbot: {
      name: 'OpenAI SearchBot',
      url: 'https://openai.com/searchbot.json',
      format: :json,
      parser: :parse_openai_ranges,
      description: 'OpenAI SearchBot for ChatGPT search features'
    },
    openai_chatgpt_user: {
      name: 'OpenAI ChatGPT-User',
      url: 'https://openai.com/chatgpt-user.json',
      format: :json,
      parser: :parse_openai_ranges,
      description: 'OpenAI ChatGPT-User for user actions in ChatGPT and Custom GPTs'
    },
    openai_gptbot: {
      name: 'OpenAI GPTBot',
      url: 'https://openai.com/gptbot.json',
      format: :json,
      parser: :parse_openai_ranges,
      description: 'OpenAI GPTBot for training AI foundation models'
    },
    cloudflare: {
      name: 'Cloudflare',
      urls: [
        'https://www.cloudflare.com/ips-v4',
        'https://www.cloudflare.com/ips-v6'
      ],
      format: :text,
      parser: :parse_cloudflare_ranges,
      description: 'Cloudflare network ranges including their crawlers and services'
    },
    facebook: {
      name: 'Facebook/Meta',
      url: 'https://developers.facebook.com/docs/sharing/webmasters/crawler/',
      format: :html,
      parser: :parse_facebook_ranges,
      description: 'Facebook/Meta crawlers and bots'
    },
    applebot: {
      name: 'Applebot',
      url: 'https://support.apple.com/en-us/HT204683',
      format: :html,
      parser: :parse_applebot_ranges,
      description: 'Applebot crawler for Apple search and Siri'
    },
    duckduckgo: {
      name: 'DuckDuckBot',
      url: 'https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot/',
      format: :html,
      parser: :parse_duckduckgo_ranges,
      description: 'DuckDuckGo search crawler'
    }
  }.freeze
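
  # Each source entry carries :name, a :url (or :urls list), a :format hint,
  # a :parser symbol, and a :description. The :format key is informational
  # only; dispatch happens entirely through :parser.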

  def self.import_from_source(source_key, options = {})
    source = BOT_SOURCES[source_key.to_sym]
    raise ImportError, "Unknown source: #{source_key}" unless source

    parser = source[:parser]
    raise ImportError, "Unknown parser: #{parser}" unless respond_to?(parser, true)

    puts "Importing bot network ranges from #{source[:name]}..."
    send(parser, source, options)
  end
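
  # Imports from every configured source and returns a results hash keyed by
  # source, e.g.:
  #   { amazon_aws: { imported: 1234, source: 'Amazon AWS' },
  #     google: { error: 'Network timeout...', imported: 0 } }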
  def self.import_all_sources(options = {})
    results = {}

    BOT_SOURCES.each do |source_key, source|
      puts "\n" + '=' * 50
      puts "Processing #{source[:name]}..."
      puts '=' * 50

      begin
        results[source_key] = import_from_source(source_key, options)
      rescue => e
        Rails.logger.error "Failed to import from #{source[:name]}: #{e.message}"
        results[source_key] = { error: e.message, imported: 0 }
      end
    end

    puts "\n" + '=' * 50
    puts 'Import Summary'
    puts '=' * 50
    results.each do |source, result|
      if result[:error]
        puts "#{source}: FAILED - #{result[:error]}"
      else
        puts "#{source}: SUCCESS - #{result[:imported]} ranges imported"
      end
    end

    results
  end

  # NOTE: a bare `private` has no effect on methods defined with `def self.`;
  # the internal helpers below are made private via private_class_method at
  # the bottom of the class.

  # Amazon AWS IP ranges parser
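  #
  # Each entry in the feed's "prefixes" array looks like:
  #   { "ip_prefix": "3.5.140.0/22", "region": "ap-northeast-2",
  #     "service": "AMAZON", "network_border_group": "ap-northeast-2" }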
  def self.parse_aws_ranges(source, options = {})
    require 'net/http'
    require 'uri'
    require 'json'

    uri = URI.parse(source[:url])
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = true
    http.read_timeout = 30

    response = http.get(uri.request_uri)
    raise ImportError, "Failed to fetch AWS IP ranges: #{response.code}" unless response.code == '200'

    data = JSON.parse(response.body)
    imported_count = 0
    batch_size = options[:batch_size] || 1000
    batch = []

    # Filter for relevant services (can be customized)
    relevant_services = options[:aws_services] || ['AMAZON', 'ROUTE53', 'EC2', 'CLOUDFRONT']

    data['prefixes'].each do |prefix|
      # Focus on relevant services and regions
      next unless relevant_services.include?(prefix['service'])

      network_range = {
        network: prefix['ip_prefix'],
        source: 'bot_import_amazon_aws',
        asn: nil, # AWS doesn't provide ASN in this feed
        asn_org: 'Amazon Web Services',
        company: 'Amazon',
        country: nil,
        is_datacenter: true,
        is_proxy: false,
        is_vpn: false,
        additional_data: {
          aws_service: prefix['service'],
          aws_region: prefix['region'],
          aws_network_border_group: prefix['network_border_group'],
          import_date: Time.current.iso8601
        }.to_json
      }
      batch << network_range

      if batch.size >= batch_size
        imported_count += import_batch(batch, 'Amazon AWS')
        batch = []
        puts "Imported #{imported_count} AWS ranges..."
      end
    end

    # Import remaining records
    imported_count += import_batch(batch, 'Amazon AWS') if batch.any?

    puts "Amazon AWS import completed: #{imported_count} ranges imported"
    { imported: imported_count, source: 'Amazon AWS' }
  rescue Net::OpenTimeout, Net::ReadTimeout => e
    raise ImportError, "Network timeout while fetching AWS ranges: #{e.message}"
  rescue JSON::ParserError => e
    raise ImportError, "Failed to parse AWS JSON response: #{e.message}"
  end

  # Google crawler IP ranges parser
  def self.parse_google_ranges(source, options = {})
    require 'net/http'
    require 'uri'
    require 'json'

    imported_count = 0
    batch_size = options[:batch_size] || 1000

    # Try each potential URL
    urls = Array(source[:urls] || source[:url])
    urls.each do |url|
      begin
        puts "Attempting to fetch Google ranges from: #{url}"
        uri = URI.parse(url)
        http = Net::HTTP.new(uri.host, uri.port)
        http.use_ssl = true
        http.read_timeout = 30

        response = http.get(uri.request_uri)
        next unless response.code == '200'

        data = JSON.parse(response.body)
        batch = []

        # Google publishes crawler ranges as { "prefixes": [{ "ipv4Prefix":
        # ... } or { "ipv6Prefix": ... }] }; a flat array of entries with
        # "cidr"/"prefix" keys is handled as a fallback
        entries = data.is_a?(Hash) ? Array(data['prefixes']) : Array(data)
        entries.each do |entry|
          cidr = entry['ipv4Prefix'] || entry['ipv6Prefix'] || entry['cidr'] || entry['prefix']
          next unless cidr

          network_range = {
            network: cidr,
            source: 'bot_import_google',
            asn: nil,
            asn_org: 'Google LLC',
            company: 'Google',
            country: nil,
            is_datacenter: true,
            is_proxy: false,
            is_vpn: false,
            additional_data: {
              crawler_type: entry['crawler_type'] || 'unknown',
              user_agent: entry['user_agent'],
              import_date: Time.current.iso8601
            }.to_json
          }
          batch << network_range

          if batch.size >= batch_size
            imported_count += import_batch(batch, 'Google')
            batch = []
            puts "Imported #{imported_count} Google ranges..."
          end
        end

        # Import remaining records
        imported_count += import_batch(batch, 'Google') if batch.any?

        puts "Google import completed: #{imported_count} ranges imported"
        return { imported: imported_count, source: 'Google' }
      rescue => e
        Rails.logger.warn "Failed to fetch from #{url}: #{e.message}"
        next
      end
    end

    raise ImportError, 'Failed to fetch Google crawler ranges from any URL'
  end

  # Microsoft Bing crawler IP ranges parser
  def self.parse_microsoft_ranges(source, options = {})
    # Microsoft requires special handling as it may not provide direct JSON
    # This is a placeholder implementation
    puts 'Microsoft Bing crawler import requires manual configuration or web scraping'
    puts 'Refer to: https://www.bing.com/webmaster/help/which-crawlers-does-bing-use'

    {
      imported: 0,
      source: 'Microsoft Bing',
      note: 'Manual configuration required - Microsoft does not provide direct IP range feeds'
    }
  end
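
  # Assumption worth verifying: BOT_SOURCES above already configures a
  # bingbot.json URL for this source; if that feed is live and uses the same
  # prefix-list shape as the Google files, the parsing logic in
  # parse_google_ranges could likely replace this placeholder.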

  # Anthropic service IP ranges parser
  def self.parse_anthropic_ranges(source, options = {})
    # Anthropic ranges may need to be manually configured
    # This is a placeholder implementation
    puts 'Anthropic Claude service ranges require manual configuration'
    puts 'Refer to: https://docs.anthropic.com/claude/reference/ip_ranges'

    {
      imported: 0,
      source: 'Anthropic',
      note: 'Manual configuration required - Anthropic does not provide automated IP range feeds'
    }
  end

  # OpenAI crawler IP ranges parser
  def self.parse_openai_ranges(source, options = {})
    require 'net/http'
    require 'uri'
    require 'json'

    # Determine the crawler type from the source name up front so it is
    # available in the rescue clauses below even when the fetch itself fails
    crawler_type = source[:name].delete_prefix('OpenAI ').downcase

    uri = URI.parse(source[:url])
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = true
    http.read_timeout = 30

    response = http.get(uri.request_uri)
    raise ImportError, "Failed to fetch OpenAI IP ranges: #{response.code}" unless response.code == '200'

    data = JSON.parse(response.body)
    imported_count = 0
    batch_size = options[:batch_size] || 1000
    batch = []

    # OpenAI's published files wrap ranges in a "prefixes" array (mirroring
    # Google's feed shape); a flat array of entries is handled as a fallback
    entries = data.is_a?(Hash) ? Array(data['prefixes']) : Array(data)
    entries.each do |entry|
      # Ranges may appear in CIDR notation or as single IPs
      ip_range = entry['ipv4Prefix'] || entry['ipv6Prefix'] ||
                 entry['cidr'] || entry['ip_prefix'] || entry['ip']
      next unless ip_range

      # Convert single IPs to host routes (/32 for IPv4, /128 for IPv6)
      network =
        if ip_range.include?('/')
          ip_range
        elsif ip_range.include?(':')
          "#{ip_range}/128"
        else
          "#{ip_range}/32"
        end

      network_range = {
        network: network,
        source: "bot_import_openai_#{crawler_type}",
        asn: nil,
        asn_org: 'OpenAI',
        company: 'OpenAI',
        country: nil,
        is_datacenter: true,
        is_proxy: false,
        is_vpn: false,
        additional_data: {
          crawler_type: crawler_type,
          crawler_purpose: crawler_purpose(crawler_type),
          user_agent: openai_user_agent(crawler_type),
          import_date: Time.current.iso8601,
          source_url: source[:url]
        }.to_json
      }
      batch << network_range

      if batch.size >= batch_size
        imported_count += import_batch(batch, "OpenAI #{crawler_type}")
        batch = []
        puts "Imported #{imported_count} OpenAI #{crawler_type} ranges..."
      end
    end

    # Import remaining records
    imported_count += import_batch(batch, "OpenAI #{crawler_type}") if batch.any?

    puts "OpenAI #{crawler_type} import completed: #{imported_count} ranges imported"
    { imported: imported_count, source: "OpenAI #{crawler_type}" }
  rescue Net::OpenTimeout, Net::ReadTimeout => e
    raise ImportError, "Network timeout while fetching OpenAI #{crawler_type} ranges: #{e.message}"
  rescue JSON::ParserError => e
    raise ImportError, "Failed to parse OpenAI #{crawler_type} JSON response: #{e.message}"
  end

  def self.import_batch(batch_data, source_name)
    # Check for existing ranges to avoid duplicates
    existing_networks = NetworkRange.where(network: batch_data.map { |d| d[:network] }).pluck(:network)
    new_ranges = batch_data.reject { |d| existing_networks.include?(d[:network]) }

    if new_ranges.any?
      NetworkRange.insert_all(new_ranges)
      puts "Imported #{new_ranges.size} new #{source_name} ranges (#{batch_data.size - new_ranges.size} duplicates skipped)"
    else
      puts "No new #{source_name} ranges to import (all duplicates)"
    end
    new_ranges.size
  rescue => e
    Rails.logger.error "Failed to import #{source_name} batch: #{e.message}"
    # Fall back to individual imports; new_ranges is nil if the failure
    # happened before the duplicate check, so retry the whole batch
    imported = 0
    (new_ranges || batch_data).each do |data|
      begin
        NetworkRange.create!(data)
        imported += 1
      rescue => individual_error
        Rails.logger.error "Failed to import individual #{source_name} record: #{individual_error.message}"
      end
    end
    imported
  end
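
  # Note: on Rails 6+, the duplicate check plus insert_all above could be
  # collapsed into NetworkRange.upsert_all(batch_data, unique_by: :network),
  # assuming a unique database index on the :network column.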

  # Helper method to determine crawler purpose based on type
  def self.crawler_purpose(crawler_type)
    case crawler_type
    when 'searchbot'
      "Used to link to and surface websites in search results in ChatGPT's search features"
    when 'chatgpt-user'
      'User actions in ChatGPT and Custom GPTs, including GPT Actions'
    when 'gptbot'
      "Used to crawl content for training OpenAI's generative AI foundation models"
    else
      'Unknown purpose'
    end
  end

  # Helper method to get OpenAI user agent strings
  def self.openai_user_agent(crawler_type)
    case crawler_type
    when 'searchbot'
      'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; OAI-SearchBot/1.0; +https://openai.com/searchbot'
    when 'chatgpt-user'
      'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; ChatGPT-User/1.0; +https://openai.com/bot'
    when 'gptbot'
      'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; GPTBot/1.1; +https://openai.com/gptbot'
    else
      'Unknown user agent'
    end
  end

  # Cloudflare IP ranges parser
  def self.parse_cloudflare_ranges(source, options = {})
    require 'net/http'
    require 'uri'

    imported_count = 0
    urls = Array(source[:urls])
    batch_size = options[:batch_size] || 1000
    batch = []

    urls.each do |url|
      begin
        puts "Fetching Cloudflare ranges from: #{url}"
        uri = URI.parse(url)
        http = Net::HTTP.new(uri.host, uri.port)
        http.use_ssl = true
        http.read_timeout = 30

        response = http.get(uri.request_uri)
        raise ImportError, "Failed to fetch Cloudflare ranges: #{response.code}" unless response.code == '200'

        # Cloudflare publishes plain-text lists with one CIDR per line
        lines = response.body.split("\n")
        ip_version = url.include?('ips-v4') ? 4 : 6

        lines.each do |line|
          line = line.strip
          next if line.empty? || line.start_with?('#')
          # Validate CIDR format
          next unless line.match?(/\A[0-9a-fA-F:.]+\/\d+\z/)

          network_range = {
            network: line,
            source: 'bot_import_cloudflare',
            asn: nil,
            asn_org: 'Cloudflare',
            company: 'Cloudflare',
            country: nil,
            is_datacenter: true,
            is_proxy: false,
            is_vpn: false,
            additional_data: {
              ip_version: ip_version,
              import_date: Time.current.iso8601,
              source_url: url,
              service_type: 'cdn_and_security'
            }.to_json
          }
          batch << network_range

          if batch.size >= batch_size
            imported_count += import_batch(batch, 'Cloudflare')
            batch = []
            puts "Imported #{imported_count} Cloudflare ranges..."
          end
        end
      rescue => e
        Rails.logger.warn "Failed to fetch Cloudflare ranges from #{url}: #{e.message}"
        next
      end
    end

    # Import remaining records
    imported_count += import_batch(batch, 'Cloudflare') if batch.any?

    puts "Cloudflare import completed: #{imported_count} ranges imported"
    { imported: imported_count, source: 'Cloudflare' }
  end

  # Facebook/Meta crawler ranges parser (placeholder)
  def self.parse_facebook_ranges(source, options = {})
    puts 'Facebook/Meta crawler ranges require web scraping or manual configuration'
    puts 'Refer to: https://developers.facebook.com/docs/sharing/webmasters/crawler/'

    {
      imported: 0,
      source: 'Facebook/Meta',
      note: 'Manual configuration required - Facebook does not provide automated IP range feeds'
    }
  end

  # Applebot crawler ranges parser (placeholder)
  def self.parse_applebot_ranges(source, options = {})
    puts 'Applebot ranges require web scraping or manual configuration'
    puts 'Refer to: https://support.apple.com/en-us/HT204683'

    {
      imported: 0,
      source: 'Applebot',
      note: 'Manual configuration required - Apple does not provide automated IP range feeds'
    }
  end

  # DuckDuckBot crawler ranges parser (placeholder)
  def self.parse_duckduckgo_ranges(source, options = {})
    puts 'DuckDuckBot ranges require web scraping or manual configuration'
    puts 'Refer to: https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot/'

    {
      imported: 0,
      source: 'DuckDuckBot',
      note: 'Manual configuration required - DuckDuckGo does not provide automated IP range feeds'
    }
  end
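
  # A minimal sketch (an assumption, not part of the original importer) of
  # what the "manual configuration" mentioned by the placeholder parsers
  # could look like: feed hand-curated CIDR strings through the same
  # import_batch path. The method name and keyword arguments are hypothetical.
  #
  #   def self.import_manual_ranges(cidrs, company:, source_label:)
  #     batch = cidrs.map do |cidr|
  #       {
  #         network: cidr,
  #         source: "bot_import_#{source_label}",
  #         asn: nil,
  #         asn_org: company,
  #         company: company,
  #         country: nil,
  #         is_datacenter: true,
  #         is_proxy: false,
  #         is_vpn: false,
  #         additional_data: { import_date: Time.current.iso8601 }.to_json
  #       }
  #     end
  #     import_batch(batch, company)
  #   end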

  private_class_method :parse_aws_ranges, :parse_google_ranges,
                       :parse_microsoft_ranges, :parse_anthropic_ranges,
                       :parse_openai_ranges, :parse_cloudflare_ranges,
                       :parse_facebook_ranges, :parse_applebot_ranges,
                       :parse_duckduckgo_ranges, :import_batch,
                       :crawler_purpose, :openai_user_agent
end