diff --git a/app/controllers/analytics_controller.rb b/app/controllers/analytics_controller.rb index 8c31a13..9fc179f 100644 --- a/app/controllers/analytics_controller.rb +++ b/app/controllers/analytics_controller.rb @@ -23,9 +23,10 @@ class AnalyticsController < ApplicationController # Cache key includes period and start_time (hour-aligned for consistency) cache_key_base = "analytics/#{@time_period}/#{@start_time.to_i}" - # Core statistics - cached + # Core statistics - cached (uses DuckDB if available) @total_events = Rails.cache.fetch("#{cache_key_base}/total_events", expires_in: cache_ttl) do - Event.where("timestamp >= ?", @start_time).count + with_duckdb_fallback { EventDdb.count_since(@start_time) } || + Event.where("timestamp >= ?", @start_time).count end @total_rules = Rails.cache.fetch("analytics/total_rules", expires_in: 5.minutes) do @@ -40,31 +41,33 @@ class AnalyticsController < ApplicationController NetworkRange.count end - # Event breakdown by action - cached + # Event breakdown by action - cached (uses DuckDB if available) @event_breakdown = Rails.cache.fetch("#{cache_key_base}/event_breakdown", expires_in: cache_ttl) do - Event.where("timestamp >= ?", @start_time) - .group(:waf_action) - .count - # Keys are already strings ("allow", "deny", etc.) from the enum + with_duckdb_fallback { EventDdb.breakdown_by_action(@start_time) } || + Event.where("timestamp >= ?", @start_time) + .group(:waf_action) + .count end - # Top countries by event count - cached (now uses denormalized country column) + # Top countries by event count - cached (uses DuckDB if available) @top_countries = Rails.cache.fetch("#{cache_key_base}/top_countries", expires_in: cache_ttl) do - Event.where("timestamp >= ? AND country IS NOT NULL", @start_time) - .group(:country) - .count - .sort_by { |_, count| -count } - .first(10) + with_duckdb_fallback { EventDdb.top_countries(@start_time, 10) } || + Event.where("timestamp >= ? AND country IS NOT NULL", @start_time) + .group(:country) + .count + .sort_by { |_, count| -count } + .first(10) end - # Top blocked IPs - cached + # Top blocked IPs - cached (uses DuckDB if available) @top_blocked_ips = Rails.cache.fetch("#{cache_key_base}/top_blocked_ips", expires_in: cache_ttl) do - Event.where("timestamp >= ?", @start_time) - .where(waf_action: 1) # deny action in enum - .group(:ip_address) - .count - .sort_by { |_, count| -count } - .first(10) + with_duckdb_fallback { EventDdb.top_blocked_ips(@start_time, 10) } || + Event.where("timestamp >= ?", @start_time) + .where(waf_action: 0) # deny action in enum + .group(:ip_address) + .count + .sort_by { |_, count| -count } + .first(10) end # Network range intelligence breakdown - cached @@ -92,7 +95,7 @@ class AnalyticsController < ApplicationController total_users: User.count, active_rules: Rule.enabled.count, disabled_rules: Rule.where(enabled: false).count, - recent_errors: Event.where("timestamp >= ? AND waf_action = ?", @start_time, 1).count # 1 = deny + recent_errors: Event.where("timestamp >= ? 
AND waf_action = ?", @start_time, 0).count # 0 = deny } end @@ -117,38 +120,90 @@ class AnalyticsController < ApplicationController @time_period = params[:period]&.to_sym || :day @start_time = calculate_start_time(@time_period) - # Top networks by request volume (using denormalized network_range_id) - # Use a subquery approach to avoid PostgreSQL GROUP BY issues with network_ranges.* - event_stats = Event.where("timestamp >= ?", @start_time) - .where.not(network_range_id: nil) - .group(:network_range_id) - .select("network_range_id, COUNT(*) as event_count, COUNT(DISTINCT ip_address) as unique_ips") + # Top networks by request volume - use DuckDB if available + network_stats = with_duckdb_fallback { EventDdb.top_networks(@start_time, 50) } - # Join the stats back to NetworkRange to get full network details - @top_networks = NetworkRange.joins("INNER JOIN (#{event_stats.to_sql}) stats ON stats.network_range_id = network_ranges.id") - .select("network_ranges.*, stats.event_count, stats.unique_ips") - .order("stats.event_count DESC") - .limit(50) + if network_stats + # DuckDB path: array format [network_range_id, event_count, unique_ips] + network_ids = network_stats.map { |row| row[0] } + stats_by_id = network_stats.to_h { |row| [row[0], { event_count: row[1], unique_ips: row[2] }] } + + @top_networks = NetworkRange.where(id: network_ids) + .to_a + .map do |network| + stats = stats_by_id[network.id] + network.define_singleton_method(:event_count) { stats[:event_count] } + network.define_singleton_method(:unique_ips) { stats[:unique_ips] } + + # Add inherited intelligence support + intelligence = network.inherited_intelligence + if intelligence[:inherited] + network.define_singleton_method(:display_company) { intelligence[:company] } + network.define_singleton_method(:display_country) { intelligence[:country] } + network.define_singleton_method(:inherited_from) { intelligence[:parent_cidr] } + network.define_singleton_method(:has_inherited_data?) { true } + else + network.define_singleton_method(:display_company) { network.company } + network.define_singleton_method(:display_country) { network.country } + network.define_singleton_method(:inherited_from) { nil } + network.define_singleton_method(:has_inherited_data?) { false } + end + + network + end + .sort_by { |n| -n.event_count } + else + # PostgreSQL fallback + event_stats = Event.where("timestamp >= ?", @start_time) + .where.not(network_range_id: nil) + .group(:network_range_id) + .select("network_range_id, COUNT(*) as event_count, COUNT(DISTINCT ip_address) as unique_ips") + + @top_networks = NetworkRange.joins("INNER JOIN (#{event_stats.to_sql}) stats ON stats.network_range_id = network_ranges.id") + .select("network_ranges.*, stats.event_count, stats.unique_ips") + .order("stats.event_count DESC") + .limit(50) + + # Add inherited intelligence support for PostgreSQL fallback + @top_networks = @top_networks.to_a.map do |network| + intelligence = network.inherited_intelligence + if intelligence[:inherited] + network.define_singleton_method(:display_company) { intelligence[:company] } + network.define_singleton_method(:display_country) { intelligence[:country] } + network.define_singleton_method(:inherited_from) { intelligence[:parent_cidr] } + network.define_singleton_method(:has_inherited_data?) 
{ true } + else + network.define_singleton_method(:display_company) { network.company } + network.define_singleton_method(:display_country) { network.country } + network.define_singleton_method(:inherited_from) { nil } + network.define_singleton_method(:has_inherited_data?) { false } + end + network + end + end # Network type breakdown with traffic stats @network_breakdown = calculate_network_type_stats(@start_time) - # Company breakdown for top traffic sources (using denormalized company column) - @top_companies = Event.where("timestamp >= ? AND company IS NOT NULL", @start_time) + # Company breakdown for top traffic sources - use DuckDB if available + @top_companies = with_duckdb_fallback { EventDdb.top_companies(@start_time, 20) } || + Event.where("timestamp >= ? AND company IS NOT NULL", @start_time) .group(:company) .select("company, COUNT(*) as event_count, COUNT(DISTINCT ip_address) as unique_ips, COUNT(DISTINCT network_range_id) as network_count") .order("event_count DESC") .limit(20) - # ASN breakdown (using denormalized asn columns) - @top_asns = Event.where("timestamp >= ? AND asn IS NOT NULL", @start_time) + # ASN breakdown - use DuckDB if available + @top_asns = with_duckdb_fallback { EventDdb.top_asns(@start_time, 15) } || + Event.where("timestamp >= ? AND asn IS NOT NULL", @start_time) .group(:asn, :asn_org) .select("asn, asn_org, COUNT(*) as event_count, COUNT(DISTINCT ip_address) as unique_ips, COUNT(DISTINCT network_range_id) as network_count") .order("event_count DESC") .limit(15) - # Geographic breakdown (using denormalized country column) - @top_countries = Event.where("timestamp >= ? AND country IS NOT NULL", @start_time) + # Geographic breakdown - use DuckDB if available + @top_countries = with_duckdb_fallback { EventDdb.top_countries_with_stats(@start_time, 15) } || + Event.where("timestamp >= ? AND country IS NOT NULL", @start_time) .group(:country) .select("country, COUNT(*) as event_count, COUNT(DISTINCT ip_address) as unique_ips") .order("event_count DESC") @@ -191,12 +246,15 @@ class AnalyticsController < ApplicationController # Historical hours are cached for full TTL, current hour cached briefly for freshness # Cache historical hours (1-23 hours ago) - these are complete and won't change - # No expiration - will stick around until evicted by cache store + # No expiration - will stick around until evicted by cache store (uses DuckDB if available) historical_timeline = Rails.cache.fetch("#{cache_key_base}/chart_historical") do historical_start = 23.hours.ago.beginning_of_hour - events_by_hour = Event.where("timestamp >= ? AND timestamp < ?", historical_start, Time.current.beginning_of_hour) - .group("DATE_TRUNC('hour', timestamp)") - .count + current_hour_start = Time.current.beginning_of_hour + + events_by_hour = with_duckdb_fallback { EventDdb.hourly_timeline(historical_start, current_hour_start) } || + Event.where("timestamp >= ? 
AND timestamp < ?", historical_start, current_hour_start) + .group("DATE_TRUNC('hour', timestamp)") + .count (1..23).map do |hour_ago| hour_time = hour_ago.hours.ago.beginning_of_hour @@ -209,6 +267,7 @@ class AnalyticsController < ApplicationController end # Current hour (0 hours ago) - cache very briefly since it's actively accumulating + # ALWAYS use PostgreSQL for current hour to get real-time data (DuckDB syncs every minute) current_hour_data = Rails.cache.fetch("#{cache_key_base}/chart_current_hour", expires_in: 1.minute) do hour_time = Time.current.beginning_of_hour count = Event.where("timestamp >= ?", hour_time).count @@ -290,6 +349,12 @@ class AnalyticsController < ApplicationController end def calculate_network_type_stats(start_time) + # Try DuckDB first, fallback to PostgreSQL + duckdb_stats = with_duckdb_fallback { EventDdb.network_type_stats(start_time) } + + return duckdb_stats if duckdb_stats + + # PostgreSQL fallback # Get all network types with their traffic statistics using denormalized columns network_types = [ { type: 'datacenter', label: 'Datacenter', column: :is_datacenter }, @@ -333,6 +398,12 @@ class AnalyticsController < ApplicationController end def calculate_suspicious_patterns(start_time) + # Try DuckDB first, fallback to PostgreSQL + duckdb_patterns = with_duckdb_fallback { EventDdb.suspicious_patterns(start_time) } + + return duckdb_patterns if duckdb_patterns + + # PostgreSQL fallback patterns = {} # High volume networks (top 1% by request count) - using denormalized network_range_id @@ -358,9 +429,9 @@ class AnalyticsController < ApplicationController high_deny_networks = Event.where("timestamp >= ? AND network_range_id IS NOT NULL", start_time) .group(:network_range_id) .select("network_range_id, - COUNT(CASE WHEN waf_action = 1 THEN 1 END) as denied_count, + COUNT(CASE WHEN waf_action = 0 THEN 1 END) as denied_count, COUNT(*) as total_count") - .having("COUNT(CASE WHEN waf_action = 1 THEN 1 END)::float / COUNT(*) > 0.5") + .having("COUNT(CASE WHEN waf_action = 0 THEN 1 END)::float / COUNT(*) > 0.5") .having("COUNT(*) >= 10") # minimum threshold patterns[:high_deny_rate] = { @@ -392,12 +463,14 @@ class AnalyticsController < ApplicationController { id: network.id, cidr: network.cidr, - company: network.company, + company: network.display_company, asn: network.asn, - country: network.country, + country: network.display_country, network_type: network.network_type, event_count: network.event_count, - unique_ips: network.unique_ips + unique_ips: network.unique_ips, + has_inherited_data: network.has_inherited_data?, + inherited_from: network.inherited_from } }, network_breakdown: @network_breakdown, @@ -449,4 +522,27 @@ class AnalyticsController < ApplicationController } end end + + # Helper method to try DuckDB first, fall back to PostgreSQL + def with_duckdb_fallback(&block) + result = yield + result.nil? ? nil : result # Return result or nil to trigger fallback + rescue StandardError => e + Rails.logger.warn "[Analytics] DuckDB query failed, falling back to PostgreSQL: #{e.message}" + nil # Return nil to trigger fallback + end + + # Check if DuckDB has recent data (within last 2 minutes) + # Returns true if DuckDB is up-to-date, false if potentially stale + def duckdb_is_fresh? + newest = AnalyticsDuckdbService.instance.newest_event_timestamp + return false if newest.nil? 
+ + # Consider fresh if newest event is within 2 minutes + # (sync job runs every 1 minute, so 2 minutes allows for some lag) + newest >= 2.minutes.ago + rescue StandardError => e + Rails.logger.warn "[Analytics] Error checking DuckDB freshness: #{e.message}" + false + end end \ No newline at end of file diff --git a/app/controllers/bot_network_ranges_controller.rb b/app/controllers/bot_network_ranges_controller.rb new file mode 100644 index 0000000..d248eea --- /dev/null +++ b/app/controllers/bot_network_ranges_controller.rb @@ -0,0 +1,126 @@ +# frozen_string_literal: true + +class BotNetworkRangesController < ApplicationController + before_action :authenticate_user! + before_action :require_admin + + def index + @bot_sources = BotNetworkRangeImporter::BOT_SOURCES + @recent_imports = DataImport.where(import_type: 'bot_network_ranges').order(created_at: :desc).limit(10) + @bot_network_ranges = NetworkRange.where("source LIKE 'bot_import_%'").order(created_at: :desc).limit(50) + end + + def import + source_key = params[:source] + options = import_options + + if source_key.present? + # Perform import synchronously for immediate feedback + begin + result = BotNetworkRangeImporter.import_from_source(source_key, options) + + # Create a data import record + DataImport.create!( + import_type: 'bot_network_ranges', + source: source_key.to_s, + status: 'completed', + records_processed: result[:imported], + notes: "Imported from #{result[:source]}: #{result[:note] || 'Success'}" + ) + + flash[:notice] = "Successfully imported #{result[:imported]} ranges from #{result[:source]}" + rescue => e + flash[:alert] = "Failed to import from #{source_key}: #{e.message}" + end + else + flash[:alert] = "Please select a source to import from" + end + + redirect_to bot_network_ranges_path + end + + def import_async + source_key = params[:source] + options = import_options + + if source_key.present? + # Create a data import record for tracking + data_import = DataImport.create!( + import_type: 'bot_network_ranges', + source: source_key.to_s, + status: 'pending', + records_processed: 0, + notes: "Import job queued for #{source_key}" + ) + + # Queue the background job + ImportBotNetworkRangesJob.perform_later(source_key, options.merge(data_import_id: data_import.id)) + + flash[:notice] = "Import job queued for #{source_key}. You'll be notified when it's complete." + else + flash[:alert] = "Please select a source to import from" + end + + redirect_to bot_network_ranges_path + end + + def import_all + options = import_options + + # Create a data import record for batch import + data_import = DataImport.create!( + import_type: 'bot_network_ranges', + source: 'all_sources', + status: 'pending', + records_processed: 0, + notes: "Batch import job queued for all available sources" + ) + + # Queue the batch import job + ImportAllBotNetworkRangesJob.perform_later(options.merge(data_import_id: data_import.id)) + + flash[:notice] = "Batch import job queued for all sources. This may take several minutes." 
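+ # Completion is reported asynchronously: ImportAllBotNetworkRangesJob broadcasts a 'batch_summary' payload on the "bot_imports" channel when every source has finished.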
+ redirect_to bot_network_ranges_path + end + + def show + # Escape the user-supplied source before it reaches a LIKE pattern (avoids SQL injection) + source_pattern = "bot_import_#{NetworkRange.sanitize_sql_like(params[:source].to_s)}%" + @network_ranges = NetworkRange.where("source LIKE ?", source_pattern) + .order(created_at: :desc) + .page(params[:page]) + .per(50) + + @source_name = BotNetworkRangeImporter::BOT_SOURCES[params[:source].to_sym]&.dig(:name) || params[:source] + @import_stats = NetworkRange.where("source LIKE ?", source_pattern) + .group(:source) + .count + end + + def destroy + source = params[:source] + # Same escaping as #show - never interpolate params into SQL + deleted_count = NetworkRange.where("source LIKE ?", "bot_import_#{NetworkRange.sanitize_sql_like(source.to_s)}%").delete_all + + flash[:notice] = "Deleted #{deleted_count} network ranges from #{source}" + redirect_to bot_network_ranges_path + end + + private + + def require_admin + redirect_to root_path, alert: 'Admin access required' unless current_user&.admin? + end + + def import_options + options = {} + + # AWS-specific options + if params[:aws_services].present? + options[:aws_services] = params[:aws_services].split(',').map(&:strip) + end + + # Batch size control + options[:batch_size] = params[:batch_size].to_i if params[:batch_size].present? + options[:batch_size] = 1000 if options[:batch_size].zero? + + options + end +end \ No newline at end of file diff --git a/app/controllers/network_ranges_controller.rb b/app/controllers/network_ranges_controller.rb index 552e839..5dc13a1 100644 --- a/app/controllers/network_ranges_controller.rb +++ b/app/controllers/network_ranges_controller.rb @@ -46,8 +46,10 @@ class NetworkRangesController < ApplicationController authorize @network_range if @network_range.persisted? - # Real network - use direct IP containment for consistency with stats - events_scope = Event.where("ip_address <<= ?", @network_range.cidr).recent + # Real network - use indexed network_range_id for much better performance + # Include child network ranges to capture all traffic within this network block + network_ids = [@network_range.id] + @network_range.child_ranges.pluck(:id) + events_scope = Event.where(network_range_id: network_ids).recent else # Virtual network - find events by IP range containment events_scope = Event.where("ip_address <<= ?::inet", @network_range.to_s).recent @@ -58,22 +60,24 @@ class NetworkRangesController < ApplicationController @child_ranges = @network_range.child_ranges.limit(20) @parent_ranges = @network_range.parent_ranges.limit(10) - @associated_rules = @network_range.persisted? ? @network_range.rules.includes(:user).order(created_at: :desc) : [] + @associated_rules = @network_range.persisted? ? @network_range.rules.includes(:user, :network_range, :waf_policy).order(created_at: :desc) : [] # Load rules from supernets and subnets - @supernet_rules = @network_range.persisted? ? @network_range.supernet_rules.includes(:network_range, :user).limit(10) : [] - @subnet_rules = @network_range.persisted? ? @network_range.child_rules.includes(:network_range, :user).limit(20) : [] + @supernet_rules = @network_range.persisted? ? @network_range.supernet_rules.includes(:network_range, :user, :waf_policy).limit(10) : [] + @subnet_rules = @network_range.persisted? ? 
@network_range.child_rules.includes(:network_range, :user, :waf_policy).limit(20) : [] # Traffic analytics (if we have events) @traffic_stats = calculate_traffic_stats(@network_range) - # Check if we have IPAPI data (or if parent has it) + # Check if we have IPAPI data (or if parent has it) - cache expensive parent lookup @has_ipapi_data = @network_range.has_network_data_from?(:ipapi) @parent_with_ipapi = nil unless @has_ipapi_data - # Check if parent has IPAPI data - parent = @network_range.parent_with_intelligence + # Cache expensive parent intelligence lookup + parent = Rails.cache.fetch("network_parent_intel:#{@network_range.cache_key}", expires_in: 1.hour) do + @network_range.parent_with_intelligence + end if parent&.has_network_data_from?(:ipapi) @parent_with_ipapi = parent @has_ipapi_data = true @@ -194,6 +198,15 @@ class NetworkRangesController < ApplicationController private + # Helper method to try DuckDB first, fall back to PostgreSQL + def with_duckdb_fallback(&block) + result = yield + result.nil? ? nil : result # Return result or nil to trigger fallback + rescue StandardError => e + Rails.logger.warn "[NetworkRanges] DuckDB query failed, falling back to PostgreSQL: #{e.message}" + nil # Return nil to trigger fallback + end + def set_network_range # Handle CIDR slugs (e.g., "40.77.167.100_32" -> "40.77.167.100/32") cidr = params[:id].gsub('_', '/') @@ -248,27 +261,37 @@ class NetworkRangesController < ApplicationController # Use indexed network_range_id for much better performance instead of expensive CIDR operator # Include child network ranges to capture all traffic within this network block network_ids = [network_range.id] + network_range.child_ranges.pluck(:id) - base_query = Event.where(network_range_id: network_ids) - # Use separate queries: one for grouping (without ordering), one for recent activity (with ordering) - events_for_grouping = base_query.limit(1000) - events_for_activity = base_query.recent.limit(20) + # Try DuckDB first for stats (much faster) + duckdb_stats = with_duckdb_fallback { EventDdb.network_traffic_stats(network_ids) } + duckdb_top_paths = with_duckdb_fallback { EventDdb.network_top_paths(network_ids, 10) } + duckdb_top_agents = with_duckdb_fallback { EventDdb.network_top_user_agents(network_ids, 5) } - # Calculate counts properly - use consistent base_query for all counts - total_requests = base_query.count - unique_ips = base_query.except(:order).distinct.count(:ip_address) - blocked_requests = base_query.blocked.count - allowed_requests = base_query.allowed.count + if duckdb_stats + # DuckDB success - use fast aggregated stats + stats = duckdb_stats.merge( + top_paths: duckdb_top_paths&.to_h || {}, + top_user_agents: duckdb_top_agents&.to_h || {}, + recent_activity: Event.where(network_range_id: network_ids).recent.limit(20) + ) + else + # PostgreSQL fallback + base_query = Event.where(network_range_id: network_ids) + events_for_grouping = base_query.limit(1000) + events_for_activity = base_query.recent.limit(20) - { - total_requests: total_requests, - unique_ips: unique_ips, - blocked_requests: blocked_requests, - allowed_requests: allowed_requests, - top_paths: events_for_grouping.group(:request_path).count.sort_by { |_, count| -count }.first(10), - top_user_agents: events_for_grouping.group(:user_agent).count.sort_by { |_, count| -count }.first(5), - recent_activity: events_for_activity - } + stats = { + total_requests: base_query.count, + unique_ips: base_query.except(:order).distinct.count(:ip_address), + blocked_requests: 
base_query.blocked.count, + allowed_requests: base_query.allowed.count, + top_paths: events_for_grouping.group(:request_path).count.sort_by { |_, count| -count }.first(10).to_h, + top_user_agents: events_for_grouping.group(:user_agent).count.sort_by { |_, count| -count }.first(5).to_h, + recent_activity: events_for_activity + } + end + + stats else # No events - return empty stats { @@ -296,8 +319,8 @@ class NetworkRangesController < ApplicationController unique_ips: base_query.except(:order).distinct.count(:ip_address), blocked_requests: base_query.blocked.count, allowed_requests: base_query.allowed.count, - top_paths: events_for_grouping.group(:request_path).count.sort_by { |_, count| -count }.first(10), - top_user_agents: events_for_grouping.group(:user_agent).count.sort_by { |_, count| -count }.first(5), + top_paths: events_for_grouping.group(:request_path).count.sort_by { |_, count| -count }.first(10).to_h, + top_user_agents: events_for_grouping.group(:user_agent).count.sort_by { |_, count| -count }.first(5).to_h, recent_activity: events_for_activity } else diff --git a/app/controllers/rules_controller.rb b/app/controllers/rules_controller.rb index 4014b45..277fa21 100644 --- a/app/controllers/rules_controller.rb +++ b/app/controllers/rules_controller.rb @@ -46,12 +46,9 @@ class RulesController < ApplicationController process_quick_create_parameters # Handle network range creation if CIDR is provided - if params[:cidr].present? && @rule.network_rule? - network_range = NetworkRange.find_or_create_by(cidr: params[:cidr]) do |range| - range.user = Current.user - range.source = 'manual' - range.creation_reason = "Created for rule ##{@rule.id}" - end + cidr_param = params[:new_cidr].presence || params[:cidr].presence + if cidr_param.present? && @rule.network_rule? + network_range = NetworkRange.find_or_create_by_cidr(cidr_param, user: Current.user, source: 'manual') @rule.network_range = network_range end @@ -132,7 +129,9 @@ class RulesController < ApplicationController :expires_at, :enabled, :source, - :network_range_id + :network_range_id, + :header_name, + :header_value ] # Only include conditions for non-network rules @@ -250,15 +249,24 @@ def process_quick_create_parameters }) end - # Parse metadata if it's a string that looks like JSON - if @rule.metadata.is_a?(String) && @rule.metadata.starts_with?('{') + # Parse metadata textarea first if it's JSON + if @rule.metadata.is_a?(String) && @rule.metadata.present? && @rule.metadata.starts_with?('{') begin @rule.metadata = JSON.parse(@rule.metadata) rescue JSON::ParserError - # Keep as string if not valid JSON + # Leave unparsed - anything still a String is normalized to an empty hash just below end end + # Ensure metadata is a hash + @rule.metadata = {} unless @rule.metadata.is_a?(Hash) + + # Handle add_header fields - use provided params or existing metadata values + if @rule.add_header_action? && (params[:header_name].present? || params[:header_value].present?) + @rule.metadata['header_name'] = params[:header_name].presence || @rule.metadata['header_name'] || 'X-Bot-Agent' + @rule.metadata['header_value'] = params[:header_value].presence || @rule.metadata['header_value'] || 'Unknown' + end + # Handle expires_at parsing for text input if params.dig(:rule, :expires_at).present? 
expires_at_str = params[:rule][:expires_at].strip diff --git a/app/javascript/controllers/rule_form_controller.js b/app/javascript/controllers/rule_form_controller.js new file mode 100644 index 0000000..0f4e7be --- /dev/null +++ b/app/javascript/controllers/rule_form_controller.js @@ -0,0 +1,36 @@ +import { Controller } from "@hotwired/stimulus" + +export default class RuleFormController extends Controller { + static targets = ["actionSelect", "addHeaderSection", "expirationCheckbox", "expirationField"] + + connect() { + this.updateActionSections() + } + + updateActionSections() { + const selectedAction = this.actionSelectTarget.value + + // Hide all action-specific sections + this.addHeaderSectionTarget.classList.add('hidden') + + // Show relevant section based on action + switch(selectedAction) { + case 'add_header': + this.addHeaderSectionTarget.classList.remove('hidden') + break + } + } + + toggleExpiration() { + if (this.expirationCheckboxTarget.checked) { + this.expirationFieldTarget.classList.remove('hidden') + } else { + this.expirationFieldTarget.classList.add('hidden') + // Clear the datetime field when unchecked + const datetimeInput = this.expirationFieldTarget.querySelector('input[type="datetime-local"]') + if (datetimeInput) { + datetimeInput.value = '' + } + } + } +} diff --git a/app/jobs/cleanup_old_events_job.rb b/app/jobs/cleanup_old_events_job.rb new file mode 100644 index 0000000..51f0362 --- /dev/null +++ b/app/jobs/cleanup_old_events_job.rb @@ -0,0 +1,52 @@ +# frozen_string_literal: true + +# CleanupOldEventsJob - Removes events older than the configured retention period +# +# This job runs periodically (hourly) to clean up old events based on the +# event_retention_days setting. This helps keep the database size manageable +# and improves query performance. +# +# The retention period is configurable via the 'event_retention_days' setting +# (default: 90 days). This allows administrators to balance between historical +# data retention and database performance. +# +# Schedule: Every hour (configured in config/recurring.yml) +class CleanupOldEventsJob < ApplicationJob + queue_as :background + + def perform + retention_days = Setting.event_retention_days + + # Don't delete if retention is set to 0 or negative (disabled) + if retention_days <= 0 + Rails.logger.info "CleanupOldEventsJob: Event retention disabled (retention_days: #{retention_days})" + return 0 + end + + cutoff_date = retention_days.days.ago + + # Count events to be deleted + old_events = Event.where('timestamp < ?', cutoff_date) + count = old_events.count + + if count.zero? 
+ Rails.logger.info "CleanupOldEventsJob: No events older than #{retention_days} days found" + return 0 + end + + Rails.logger.info "CleanupOldEventsJob: Deleting #{count} events older than #{retention_days} days (before #{cutoff_date})" + + # Delete in batches to avoid long-running transactions + deleted_count = 0 + batch_size = 10_000 + + old_events.in_batches(of: batch_size) do |batch| + batch_count = batch.delete_all + deleted_count += batch_count + Rails.logger.info "CleanupOldEventsJob: Deleted batch of #{batch_count} events (total: #{deleted_count}/#{count})" + end + + Rails.logger.info "CleanupOldEventsJob: Successfully deleted #{deleted_count} events" + deleted_count + end +end diff --git a/app/jobs/fetch_ipapi_data_job.rb b/app/jobs/fetch_ipapi_data_job.rb index 8adeeef..477c4e8 100644 --- a/app/jobs/fetch_ipapi_data_job.rb +++ b/app/jobs/fetch_ipapi_data_job.rb @@ -15,31 +15,14 @@ class FetchIpapiDataJob < ApplicationJob ipapi_data = Ipapi.lookup(sample_ip) if ipapi_data.present? && !ipapi_data.key?('error') - # Check if IPAPI returned a different route than our tracking network - ipapi_route = ipapi_data.dig('asn', 'route') - target_network = tracking_network + # Process IPAPI data and create network ranges + result = Ipapi.process_ipapi_data(ipapi_data, tracking_network) - if ipapi_route.present? && ipapi_route != tracking_network.cidr - # IPAPI returned a different CIDR - find or create that network range - Rails.logger.info "IPAPI returned different route: #{ipapi_route} (requested: #{tracking_network.cidr})" + # Mark the tracking network as having been queried + # Use the broadest CIDR returned for deduplication + tracking_network.mark_ipapi_queried!(result[:broadest_cidr]) - target_network = NetworkRange.find_or_create_by(network: ipapi_route) do |nr| - nr.source = 'api_imported' - nr.creation_reason = "Created from IPAPI lookup for #{tracking_network.cidr}" - end - - Rails.logger.info "Storing IPAPI data on correct network: #{target_network.cidr}" - end - - # Store data on the target network (wherever IPAPI said it belongs) - target_network.set_network_data(:ipapi, ipapi_data) - target_network.last_api_fetch = Time.current - target_network.save! - - # Mark the tracking network as having been queried, with the CIDR that was returned - tracking_network.mark_ipapi_queried!(target_network.cidr) - - Rails.logger.info "Successfully fetched IPAPI data for #{tracking_network.cidr} (stored on #{target_network.cidr})" + Rails.logger.info "Successfully fetched IPAPI data for #{tracking_network.cidr} (created #{result[:networks].length} networks)" # Broadcast to the tracking network broadcast_ipapi_update(tracking_network, ipapi_data) diff --git a/app/jobs/import_all_bot_network_ranges_job.rb b/app/jobs/import_all_bot_network_ranges_job.rb new file mode 100644 index 0000000..3e84425 --- /dev/null +++ b/app/jobs/import_all_bot_network_ranges_job.rb @@ -0,0 +1,26 @@ +# frozen_string_literal: true + +# ImportAllBotNetworkRangesJob - Background job for importing from all bot sources +class ImportAllBotNetworkRangesJob < ApplicationJob + queue_as :default + + def perform(options = {}) + Rails.logger.info "Starting batch import of all bot network ranges" + + results = BotNetworkRangeImporter.import_all_sources(options) + + # Send completion summary + Rails.logger.info "Batch import completed. 
Summary: #{results}" + + # Broadcast summary to clients + ActionCable.server.broadcast( + "bot_imports", + { + type: 'batch_summary', + status: 'completed', + results: results, + message: "Batch import completed for all sources" + } + ) + end +end \ No newline at end of file diff --git a/app/jobs/import_bot_network_ranges_job.rb b/app/jobs/import_bot_network_ranges_job.rb new file mode 100644 index 0000000..217051e --- /dev/null +++ b/app/jobs/import_bot_network_ranges_job.rb @@ -0,0 +1,47 @@ +# frozen_string_literal: true + +# ImportBotNetworkRangesJob - Background job for importing bot network ranges +# +# Imports network ranges from official bot provider sources. +# Runs asynchronously to avoid blocking the web interface. +class ImportBotNetworkRangesJob < ApplicationJob + queue_as :default + + def perform(source_key, options = {}) + Rails.logger.info "Starting bot network range import for source: #{source_key}" + + begin + result = BotNetworkRangeImporter.import_from_source(source_key, options) + + # Send notification or log completion + Rails.logger.info "Successfully imported #{result[:imported]} ranges from #{result[:source]}" + + # Optionally broadcast via Turbo Streams for real-time updates + ActionCable.server.broadcast( + "bot_imports", + { + source: source_key, + status: 'completed', + imported: result[:imported], + message: "Successfully imported #{result[:imported]} ranges from #{result[:source]}" + } + ) + + rescue => e + Rails.logger.error "Bot network range import failed for #{source_key}: #{e.message}" + + # Broadcast error notification + ActionCable.server.broadcast( + "bot_imports", + { + source: source_key, + status: 'error', + error: e.message, + message: "Failed to import from #{source_key}: #{e.message}" + } + ) + + raise e + end + end +end \ No newline at end of file diff --git a/app/jobs/process_waf_event_job.rb b/app/jobs/process_waf_event_job.rb index dac56a3..d502449 100644 --- a/app/jobs/process_waf_event_job.rb +++ b/app/jobs/process_waf_event_job.rb @@ -53,16 +53,15 @@ class ProcessWafEventJob < ApplicationJob # Queue IPAPI enrichment based on /24 tracking # The tracking network is the /24 that stores ipapi_queried_at if NetworkRange.should_fetch_ipapi_for_ip?(event.ip_address) - # Use tracking network for fetch status to avoid race conditions - if tracking_network.is_fetching_api_data?(:ipapi) - Rails.logger.info "Skipping IPAPI fetch for #{tracking_network.cidr} - already being fetched" - else - tracking_network.mark_as_fetching_api_data!(:ipapi) + # Atomically mark as fetching - this prevents duplicate jobs via database lock + if tracking_network.mark_as_fetching_api_data!(:ipapi) Rails.logger.info "Queueing IPAPI fetch for IP #{event.ip_address} (tracking network: #{tracking_network.cidr})" FetchIpapiDataJob.perform_later(network_range_id: tracking_network.id) + else + Rails.logger.info "Skipping IPAPI fetch for #{tracking_network.cidr} - another job already started" end else - Rails.logger.debug "Skipping IPAPI fetch for IP #{event.ip_address} - already queried recently" + Rails.logger.debug "Skipping IPAPI fetch for IP #{event.ip_address} - already queried or being fetched" end # Evaluate WAF policies inline if needed (lazy evaluation) diff --git a/app/jobs/sync_events_to_duckdb_job.rb b/app/jobs/sync_events_to_duckdb_job.rb new file mode 100644 index 0000000..5741a18 --- /dev/null +++ b/app/jobs/sync_events_to_duckdb_job.rb @@ -0,0 +1,89 @@ +# frozen_string_literal: true + +# Background job to sync events from PostgreSQL to DuckDB +# Runs every 5 
minute to keep the analytics database up-to-date +# Uses watermark tracking to only sync new events +class SyncEventsToDuckdbJob < ApplicationJob + queue_as :default + + # Key for storing last sync timestamp in Rails cache + WATERMARK_CACHE_KEY = "duckdb_last_sync_time" + WATERMARK_TTL = 1.week + + # Overlap window to catch late-arriving events + SYNC_OVERLAP = 1.minute + + def perform + service = AnalyticsDuckdbService.instance + + # Determine where to start syncing + from_timestamp = determine_sync_start_time(service) + + Rails.logger.info "[DuckDB Sync] Starting sync from #{from_timestamp}" + + # Sync new events using PostgreSQL cursor + DuckDB Appender + # (setup_schema is called internally within sync_new_events) + count = service.sync_new_events(from_timestamp) + + # Update watermark if we synced any events + if count > 0 + update_last_sync_time + Rails.logger.info "[DuckDB Sync] Successfully synced #{count} events" + else + Rails.logger.info "[DuckDB Sync] No new events to sync" + end + rescue StandardError => e + Rails.logger.error "[DuckDB Sync] Job failed: #{e.message}" + Rails.logger.error e.backtrace.join("\n") + raise # Re-raise to mark job as failed in Solid Queue + end + + private + + # Determine timestamp to start syncing from + # Strategy: + # 1. First run (DuckDB empty): sync from oldest PostgreSQL event + # 2. Subsequent runs: sync from last watermark with overlap + def determine_sync_start_time(service) + oldest_duckdb = service.oldest_event_timestamp + + if oldest_duckdb.nil? + # DuckDB is empty - this is the first sync + # Start from oldest PostgreSQL event (or reasonable cutoff) + oldest_pg = Event.minimum(:timestamp) + + if oldest_pg.nil? + # No events in PostgreSQL at all + Rails.logger.warn "[DuckDB Sync] No events found in PostgreSQL" + 1.day.ago # Default to recent window + else + Rails.logger.info "[DuckDB Sync] First sync - starting from oldest event: #{oldest_pg}" + oldest_pg + end + else + # DuckDB has data - sync from last watermark with overlap + last_sync = Rails.cache.read(WATERMARK_CACHE_KEY) + + if last_sync.nil? + # Watermark not in cache (maybe cache expired or restarted) + # Fall back to newest event in DuckDB + newest_duckdb = service.newest_event_timestamp + start_time = newest_duckdb ? 
newest_duckdb - SYNC_OVERLAP : oldest_duckdb + Rails.logger.info "[DuckDB Sync] Watermark not found, using newest DuckDB event: #{start_time}" + start_time + else + # Normal case: use watermark with overlap to catch late arrivals + start_time = last_sync - SYNC_OVERLAP + Rails.logger.debug "[DuckDB Sync] Using watermark: #{last_sync} (with #{SYNC_OVERLAP}s overlap)" + start_time + end + end + end + + # Update last sync watermark in cache + def update_last_sync_time + now = Time.current + Rails.cache.write(WATERMARK_CACHE_KEY, now, expires_in: WATERMARK_TTL) + Rails.logger.debug "[DuckDB Sync] Updated watermark to #{now}" + end +end diff --git a/app/models/event.rb b/app/models/event.rb index c68e338..a374900 100644 --- a/app/models/event.rb +++ b/app/models/event.rb @@ -10,6 +10,17 @@ class Event < ApplicationRecord # Enums for fixed value sets # Canonical WAF action order - aligned with Rule and Agent models + # + # IMPORTANT: These values were swapped to match baffle-agent convention: + # - deny: 0 (blocked traffic) + # - allow: 1 (allowed traffic) + # + # When using raw integer values in queries: + # - waf_action = 0 -> denied/blocked requests + # - waf_action = 1 -> allowed requests + # - waf_action = 2 -> redirect requests + # - waf_action = 3 -> challenge requests + # - waf_action = 4 -> log-only requests enum :waf_action, { deny: 0, # deny/block allow: 1, # allow/pass @@ -341,11 +352,11 @@ class Event < ApplicationRecord end def blocked? - waf_action.in?(['block', 'deny']) + waf_action == 'deny' # deny = 0 end def allowed? - waf_action.in?(['allow', 'pass']) + waf_action == 'allow' # allow = 1 end def logged? diff --git a/app/models/event_ddb.rb b/app/models/event_ddb.rb new file mode 100644 index 0000000..564f268 --- /dev/null +++ b/app/models/event_ddb.rb @@ -0,0 +1,499 @@ +# frozen_string_literal: true + +require 'ostruct' + +# EventDdb - DuckDB-backed analytics queries for events +# Provides an ActiveRecord-like interface for querying DuckDB events table +# Falls back to PostgreSQL Event model if DuckDB is unavailable +class EventDdb + class << self + # Get DuckDB service + def service + AnalyticsDuckdbService.instance + end + + # Total events since timestamp + def count_since(start_time) + service.with_connection do |conn| + result = conn.query("SELECT COUNT(*) as count FROM events WHERE timestamp >= ?", start_time) + result.first&.first || 0 + end + rescue StandardError => e + Rails.logger.error "[EventDdb] Error in count_since: #{e.message}" + nil # Fallback to PostgreSQL + end + + # Event breakdown by WAF action + def breakdown_by_action(start_time) + service.with_connection do |conn| + result = conn.query(<<~SQL, start_time) + SELECT waf_action, COUNT(*) as count + FROM events + WHERE timestamp >= ? + GROUP BY waf_action + SQL + + # Convert to hash like ActiveRecord .group.count returns + result.to_a.to_h { |row| [row["waf_action"], row["count"]] } + end + rescue StandardError => e + Rails.logger.error "[EventDdb] Error in breakdown_by_action: #{e.message}" + nil + end + + # Top countries with event counts + def top_countries(start_time, limit = 10) + service.with_connection do |conn| + result = conn.query(<<~SQL, start_time, limit) + SELECT country, COUNT(*) as count + FROM events + WHERE timestamp >= ? AND country IS NOT NULL + GROUP BY country + ORDER BY count DESC + LIMIT ? 
+ SQL + + # Return array of [country, count] tuples like ActiveRecord + result.to_a.map { |row| [row["country"], row["count"]] } + end + rescue StandardError => e + Rails.logger.error "[EventDdb] Error in top_countries: #{e.message}" + nil + end + + # Top blocked IPs + def top_blocked_ips(start_time, limit = 10) + service.with_connection do |conn| + result = conn.query(<<~SQL, start_time, limit) + SELECT ip_address, COUNT(*) as count + FROM events + WHERE timestamp >= ? AND waf_action = 0 + GROUP BY ip_address + ORDER BY count DESC + LIMIT ? + SQL + + result.to_a.map { |row| [row["ip_address"], row["count"]] } + end + rescue StandardError => e + Rails.logger.error "[EventDdb] Error in top_blocked_ips: #{e.message}" + nil + end + + # Hourly timeline aggregation + def hourly_timeline(start_time, end_time) + service.with_connection do |conn| + result = conn.query(<<~SQL, start_time, end_time) + SELECT + DATE_TRUNC('hour', timestamp) as hour, + COUNT(*) as count + FROM events + WHERE timestamp >= ? AND timestamp < ? + GROUP BY hour + ORDER BY hour + SQL + + # Convert to hash with Time keys like ActiveRecord + result.to_a.to_h { |row| [row["hour"], row["count"]] } + end + rescue StandardError => e + Rails.logger.error "[EventDdb] Error in hourly_timeline: #{e.message}" + nil + end + + # Top networks by traffic volume + # Returns array of arrays: [network_range_id, event_count, unique_ips] + def top_networks(start_time, limit = 50) + service.with_connection do |conn| + result = conn.query(<<~SQL, start_time, limit) + SELECT + network_range_id, + COUNT(*) as event_count, + COUNT(DISTINCT ip_address) as unique_ips + FROM events + WHERE timestamp >= ? AND network_range_id IS NOT NULL + GROUP BY network_range_id + ORDER BY event_count DESC + LIMIT ? + SQL + + result.to_a + end + rescue StandardError => e + Rails.logger.error "[EventDdb] Error in top_networks: #{e.message}" + nil + end + + # Top companies + # Returns array of OpenStruct objects with: company, event_count, unique_ips, network_count + def top_companies(start_time, limit = 20) + service.with_connection do |conn| + result = conn.query(<<~SQL, start_time, limit) + SELECT + company, + COUNT(*) as event_count, + COUNT(DISTINCT ip_address) as unique_ips, + COUNT(DISTINCT network_range_id) as network_count + FROM events + WHERE timestamp >= ? AND company IS NOT NULL + GROUP BY company + ORDER BY event_count DESC + LIMIT ? + SQL + + # Convert arrays to OpenStruct for attribute access + result.to_a.map do |row| + OpenStruct.new( + company: row[0], + event_count: row[1], + unique_ips: row[2], + network_count: row[3] + ) + end + end + rescue StandardError => e + Rails.logger.error "[EventDdb] Error in top_companies: #{e.message}" + nil + end + + # Top ASNs + # Returns array of OpenStruct objects with: asn, asn_org, event_count, unique_ips, network_count + def top_asns(start_time, limit = 15) + service.with_connection do |conn| + result = conn.query(<<~SQL, start_time, limit) + SELECT + asn, + asn_org, + COUNT(*) as event_count, + COUNT(DISTINCT ip_address) as unique_ips, + COUNT(DISTINCT network_range_id) as network_count + FROM events + WHERE timestamp >= ? AND asn IS NOT NULL + GROUP BY asn, asn_org + ORDER BY event_count DESC + LIMIT ? 
+ SQL + + # Convert arrays to OpenStruct for attribute access + result.to_a.map do |row| + OpenStruct.new( + asn: row[0], + asn_org: row[1], + event_count: row[2], + unique_ips: row[3], + network_count: row[4] + ) + end + end + rescue StandardError => e + Rails.logger.error "[EventDdb] Error in top_asns: #{e.message}" + nil + end + + # Network type breakdown (datacenter, VPN, proxy, standard) + # Returns hash with network_type as key and hash of stats as value + def network_type_breakdown(start_time) + service.with_connection do |conn| + result = conn.query(<<~SQL, start_time) + SELECT + CASE + WHEN is_datacenter THEN 'datacenter' + WHEN is_vpn THEN 'vpn' + WHEN is_proxy THEN 'proxy' + ELSE 'standard' + END as network_type, + COUNT(*) as event_count, + COUNT(DISTINCT ip_address) as unique_ips, + COUNT(DISTINCT network_range_id) as network_count + FROM events + WHERE timestamp >= ? + GROUP BY network_type + SQL + + # Convert arrays to hash: network_type => { event_count, unique_ips, network_count } + result.to_a.to_h do |row| + [ + row[0], # network_type + { + "event_count" => row[1], + "unique_ips" => row[2], + "network_count" => row[3] + } + ] + end + end + rescue StandardError => e + Rails.logger.error "[EventDdb] Error in network_type_breakdown: #{e.message}" + nil + end + + # Top countries with detailed stats (event count and unique IPs) + # Returns array of OpenStruct objects with: country, event_count, unique_ips + def top_countries_with_stats(start_time, limit = 15) + service.with_connection do |conn| + result = conn.query(<<~SQL, start_time, limit) + SELECT + country, + COUNT(*) as event_count, + COUNT(DISTINCT ip_address) as unique_ips + FROM events + WHERE timestamp >= ? AND country IS NOT NULL + GROUP BY country + ORDER BY event_count DESC + LIMIT ? + SQL + + # Convert arrays to OpenStruct for attribute access + result.to_a.map do |row| + OpenStruct.new( + country: row[0], + event_count: row[1], + unique_ips: row[2] + ) + end + end + rescue StandardError => e + Rails.logger.error "[EventDdb] Error in top_countries_with_stats: #{e.message}" + nil + end + + # Network type stats with formatted output matching controller expectations + # Returns hash with type keys containing label, networks, events, unique_ips, percentage + def network_type_stats(start_time) + service.with_connection do |conn| + # Get total events for percentage calculation + total_result = conn.query("SELECT COUNT(*) as total FROM events WHERE timestamp >= ?", start_time) + total_events = total_result.first&.first || 0 + + # Get breakdown by network type + breakdown = network_type_breakdown(start_time) + return nil unless breakdown + + # Format results with labels and percentages + results = {} + + { + 'datacenter' => 'Datacenter', + 'vpn' => 'VPN', + 'proxy' => 'Proxy', + 'standard' => 'Standard' + }.each do |type, label| + stats = breakdown[type] + event_count = stats ? stats["event_count"] : 0 + + results[type] = { + label: label, + networks: stats ? stats["network_count"] : 0, + events: event_count, + unique_ips: stats ? stats["unique_ips"] : 0, + percentage: total_events > 0 ? ((event_count.to_f / total_events) * 100).round(1) : 0 + } + end + + results + end + rescue StandardError => e + Rails.logger.error "[EventDdb] Error in network_type_stats: #{e.message}" + nil + end + + # Network range traffic statistics + # Returns comprehensive stats for a given network range ID(s) + def network_traffic_stats(network_range_ids) + network_range_ids = Array(network_range_ids) + return nil if network_range_ids.empty? 
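+ # The guard above also protects the SQL below: with zero ids the interpolated placeholder list would render as "IN ()", which is a syntax error.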
+ + service.with_connection do |conn| + # Build IN clause with placeholders + placeholders = network_range_ids.map { "?" }.join(", ") + + # Get all stats in a single query + result = conn.query(<<~SQL, *network_range_ids) + SELECT + COUNT(*) as total_requests, + COUNT(DISTINCT ip_address) as unique_ips, + SUM(CASE WHEN waf_action = 0 THEN 1 ELSE 0 END) as blocked_requests, + SUM(CASE WHEN waf_action = 1 THEN 1 ELSE 0 END) as allowed_requests + FROM events + WHERE network_range_id IN (#{placeholders}) + SQL + + stats_row = result.first + return nil unless stats_row + + { + total_requests: stats_row[0] || 0, + unique_ips: stats_row[1] || 0, + blocked_requests: stats_row[2] || 0, + allowed_requests: stats_row[3] || 0 + } + end + rescue StandardError => e + Rails.logger.error "[EventDdb] Error in network_traffic_stats: #{e.message}" + nil + end + + # Top paths for network range(s) + def network_top_paths(network_range_ids, limit = 10) + network_range_ids = Array(network_range_ids) + return nil if network_range_ids.empty? + + service.with_connection do |conn| + # Build IN clause with placeholders + placeholders = network_range_ids.map { "?" }.join(", ") + + result = conn.query(<<~SQL, *network_range_ids, limit) + SELECT + request_path, + COUNT(*) as count + FROM events + WHERE network_range_id IN (#{placeholders}) + AND request_path IS NOT NULL + GROUP BY request_path + ORDER BY count DESC + LIMIT ? + SQL + + result.to_a.map { |row| [row[0], row[1]] } + end + rescue StandardError => e + Rails.logger.error "[EventDdb] Error in network_top_paths: #{e.message}" + nil + end + + # Top user agents for network range(s) + def network_top_user_agents(network_range_ids, limit = 5) + network_range_ids = Array(network_range_ids) + return nil if network_range_ids.empty? + + service.with_connection do |conn| + # Build IN clause with placeholders + placeholders = network_range_ids.map { "?" }.join(", ") + + result = conn.query(<<~SQL, *network_range_ids, limit) + SELECT + user_agent, + COUNT(*) as count + FROM events + WHERE network_range_id IN (#{placeholders}) + AND user_agent IS NOT NULL + GROUP BY user_agent + ORDER BY count DESC + LIMIT ? + SQL + + result.to_a.map { |row| [row[0], row[1]] } + end + rescue StandardError => e + Rails.logger.error "[EventDdb] Error in network_top_user_agents: #{e.message}" + nil + end + + # Full user agent tally for network range(s) + # Returns hash of user_agent => count for all agents in the network + def network_agent_tally(network_range_ids) + network_range_ids = Array(network_range_ids) + return nil if network_range_ids.empty? + + service.with_connection do |conn| + # Build IN clause with placeholders + placeholders = network_range_ids.map { "?" 
}.join(", ") + + result = conn.query(<<~SQL, *network_range_ids) + SELECT + user_agent, + COUNT(*) as count + FROM events + WHERE network_range_id IN (#{placeholders}) + AND user_agent IS NOT NULL + GROUP BY user_agent + SQL + + # Convert to hash matching Ruby .tally format + result.to_a.to_h { |row| [row[0], row[1]] } + end + rescue StandardError => e + Rails.logger.error "[EventDdb] Error in network_agent_tally: #{e.message}" + nil + end + + # Suspicious network activity patterns + # Detects high-volume networks, high deny rates, and distributed companies + def suspicious_patterns(start_time) + service.with_connection do |conn| + # High volume networks (5x average) + avg_query = conn.query(<<~SQL, start_time) + SELECT + AVG(event_count) as avg_events + FROM ( + SELECT network_range_id, COUNT(*) as event_count + FROM events + WHERE timestamp >= ? AND network_range_id IS NOT NULL + GROUP BY network_range_id + ) network_stats + SQL + + avg_events = avg_query.first&.first || 0 + threshold = avg_events * 5 + + high_volume = conn.query(<<~SQL, start_time, threshold) + SELECT + network_range_id, + COUNT(*) as event_count + FROM events + WHERE timestamp >= ? AND network_range_id IS NOT NULL + GROUP BY network_range_id + HAVING COUNT(*) > ? + ORDER BY event_count DESC + SQL + + # High deny rate networks (>50% blocked, min 10 requests) + high_deny = conn.query(<<~SQL, start_time) + SELECT + network_range_id, + SUM(CASE WHEN waf_action = 0 THEN 1 ELSE 0 END) as denied_count, + COUNT(*) as total_count + FROM events + WHERE timestamp >= ? AND network_range_id IS NOT NULL + GROUP BY network_range_id + HAVING CAST(SUM(CASE WHEN waf_action = 0 THEN 1 ELSE 0 END) AS FLOAT) / COUNT(*) > 0.5 + AND COUNT(*) >= 10 + ORDER BY denied_count DESC + SQL + + # Distributed companies (appearing with 5+ unique IPs) + distributed_companies = conn.query(<<~SQL, start_time) + SELECT + company, + COUNT(DISTINCT ip_address) as ip_count + FROM events + WHERE timestamp >= ? AND company IS NOT NULL + GROUP BY company + HAVING COUNT(DISTINCT ip_address) > 5 + ORDER BY ip_count DESC + LIMIT 10 + SQL + + { + high_volume: { + count: high_volume.to_a.length, + networks: high_volume.to_a.map { |row| row[0] } # network_range_id + }, + high_deny_rate: { + count: high_deny.to_a.length, + network_ids: high_deny.to_a.map { |row| row[0] } # network_range_id + }, + distributed_companies: distributed_companies.to_a.map { |row| + { + company: row[0], # company name + subnets: row[1] # ip_count + } + } + } + end + rescue StandardError => e + Rails.logger.error "[EventDdb] Error in suspicious_patterns: #{e.message}" + nil + end + end +end diff --git a/app/models/network_range.rb b/app/models/network_range.rb index 0cf96b5..503b018 100644 --- a/app/models/network_range.rb +++ b/app/models/network_range.rb @@ -158,13 +158,26 @@ class NetworkRange < ApplicationRecord end def mark_as_fetching_api_data!(source) - self.network_data ||= {} - self.network_data['fetching_status'] ||= {} - self.network_data['fetching_status'][source.to_s] = { - 'started_at' => Time.current.to_f, - 'job_id' => SecureRandom.hex(8) - } - save! + # Use database-level locking to prevent race conditions + transaction do + # Reload with lock to get fresh data + lock! 
+ + # Double-check that we're not already fetching + if is_fetching_api_data?(source) + Rails.logger.info "Another job already started fetching #{source} for #{cidr}" + return false + end + + self.network_data ||= {} + self.network_data['fetching_status'] ||= {} + self.network_data['fetching_status'][source.to_s] = { + 'started_at' => Time.current.to_f, + 'job_id' => SecureRandom.hex(8) + } + save! + true + end end def clear_fetching_status!(source) @@ -222,9 +235,29 @@ class NetworkRange < ApplicationRecord end def agent_tally - # Rails.cache.fetch("#{to_s}:agent_tally", expires_in: 5.minutes) do - events.map(&:user_agent).tally - # end + Rails.cache.fetch("#{cache_key}:agent_tally", expires_in: 5.minutes) do + # Use DuckDB for fast agent tally instead of loading all events into memory + if persisted? && events_count > 0 + # Include child network ranges to capture all traffic within this network block + network_ids = [id] + child_ranges.pluck(:id) + + # Try DuckDB first for much faster aggregation + duckdb_tally = with_duckdb_fallback { EventDdb.network_agent_tally(network_ids) } + duckdb_tally || {} # on DuckDB failure, serve an empty tally rather than loading every event row + else + # Virtual network (or no indexed events) - tally via the events association in PostgreSQL + events.map(&:user_agent).tally + end + end + end + + # Helper method to try DuckDB first, fall back to PostgreSQL + def with_duckdb_fallback(&block) + result = yield + result.nil? ? nil : result # Return result or nil to trigger fallback + rescue StandardError => e + Rails.logger.warn "[NetworkRange] DuckDB query failed, falling back to PostgreSQL: #{e.message}" + nil # Return nil to trigger fallback end # Geographic lookup @@ -334,6 +367,9 @@ class NetworkRange < ApplicationRecord def self.should_fetch_ipapi_for_ip?(ip_address) tracking_network = find_or_create_tracking_network_for_ip(ip_address) + # Check if currently being fetched (prevents duplicate jobs) + return false if tracking_network.is_fetching_api_data?(:ipapi) + # Check if /24 has been queried recently queried_at = tracking_network.network_data&.dig('ipapi_queried_at') return true if queried_at.nil? diff --git a/app/models/rule.rb b/app/models/rule.rb index 58ac6f1..d5331e1 100644 --- a/app/models/rule.rb +++ b/app/models/rule.rb @@ -7,7 +7,7 @@ class Rule < ApplicationRecord # Rule enums (prefix needed to avoid rate_limit collision) # Canonical WAF action order - aligned with Agent and Event models - enum :waf_action, { deny: 0, allow: 1, redirect: 2, challenge: 3, log: 4 }, prefix: :action + enum :waf_action, { deny: 0, allow: 1, redirect: 2, challenge: 3, log: 4, add_header: 5 }, prefix: :action enum :waf_rule_type, { network: 0, rate_limit: 1, path_pattern: 2 }, prefix: :type SOURCES = %w[manual auto:scanner_detected auto:rate_limit_exceeded auto:bot_detected imported default manual:surgical_block manual:surgical_exception policy].freeze @@ -120,6 +120,10 @@ class Rule < ApplicationRecord action_challenge? end + def add_header_action? + action_add_header? + end + # Redirect/challenge convenience methods def redirect_url metadata_hash['redirect_url'] @@ -137,6 +141,14 @@ class Rule < ApplicationRecord metadata&.dig('challenge_message') end + def header_name + metadata&.dig('header_name') + end + + def header_value + metadata&.dig('header_value') + end + def related_surgical_rules if surgical_block? 
# Find the corresponding exception rule end @@ -421,6 +433,12 @@ class Rule < ApplicationRecord if source&.start_with?('auto:') || source == 'default' self.user ||= User.find_by(role: 1) # admin role end + + # Set default header values for add_header action + if add_header_action? + self.metadata['header_name'] ||= 'X-Bot-Agent' + self.metadata['header_value'] ||= 'Unknown' + end end def calculate_priority_for_network_rules @@ -504,6 +522,13 @@ class Rule < ApplicationRecord if challenge_type_value && !%w[captcha javascript proof_of_work].include?(challenge_type_value) errors.add(:metadata, "challenge_type must be one of: captcha, javascript, proof_of_work") end + when "add_header" + unless metadata&.dig("header_name").present? + errors.add(:metadata, "must include 'header_name' for add_header action") + end + unless metadata&.dig("header_value").present? + errors.add(:metadata, "must include 'header_value' for add_header action") + end end end diff --git a/app/models/waf_policy.rb b/app/models/waf_policy.rb index ff39972..b5f4017 100644 --- a/app/models/waf_policy.rb +++ b/app/models/waf_policy.rb @@ -9,7 +9,7 @@ class WafPolicy < ApplicationRecord POLICY_TYPES = %w[country asn company network_type path_pattern].freeze # Actions - what to do when traffic matches this policy - ACTIONS = %w[allow deny redirect challenge].freeze + ACTIONS = %w[allow deny redirect challenge add_header].freeze # Associations belongs_to :user @@ -25,6 +25,7 @@ validate :targets_must_be_array validate :validate_targets_by_type validate :validate_redirect_configuration, if: :redirect_policy_action? validate :validate_challenge_configuration, if: :challenge_policy_action? + validate :validate_add_header_configuration, if: :add_header_policy_action? # Scopes scope :enabled, -> { where(enabled: true) } @@ -95,6 +96,10 @@ validate :targets_must_be_array policy_action == 'challenge' end + def add_header_policy_action? + policy_action == 'add_header' + end + # Lifecycle methods def active? enabled? && !expired? @@ -163,7 +168,7 @@ validate :targets_must_be_array priority: network_range.prefix_length ) - # Handle redirect/challenge specific data + # Handle redirect/challenge/add_header specific data if redirect_action? && additional_data['redirect_url'] rule.update!( metadata: rule.metadata.merge( @@ -178,6 +183,13 @@ validate :targets_must_be_array challenge_message: additional_data['challenge_message'] ) ) + elsif add_header_policy_action? + rule.update!( + metadata: rule.metadata.merge( + header_name: additional_data['header_name'], + header_value: additional_data['header_value'] + ) + ) + end rule @@ -212,7 +224,7 @@ validate :targets_must_be_array priority: 50 # Default priority for path rules ) - # Handle redirect/challenge specific data + # Handle redirect/challenge/add_header specific data if redirect_action? && additional_data['redirect_url'] rule.update!( metadata: rule.metadata.merge( @@ -227,6 +239,13 @@ validate :targets_must_be_array challenge_message: additional_data['challenge_message'] ) ) + elsif add_header_policy_action? + rule.update!( + metadata: rule.metadata.merge( + header_name: additional_data['header_name'], + header_value: additional_data['header_value'] + ) + ) + end rule @@ -346,6 +365,12 @@ validate :targets_must_be_array self.targets ||= [] self.additional_data ||= {} self.enabled = true if enabled.nil? + + # Set default header values for add_header action + if add_header_policy_action? 
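+ # Mirrors Rule#set_defaults, so rules materialized from this policy carry the same header defaults.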
+ self.additional_data['header_name'] ||= 'X-Bot-Agent' + self.additional_data['header_value'] ||= 'Unknown' + end end def targets_must_be_array @@ -430,6 +455,15 @@ validate :targets_must_be_array end end + def validate_add_header_configuration + if additional_data['header_name'].blank? + errors.add(:additional_data, "must include 'header_name' for add_header action") + end + if additional_data['header_value'].blank? + errors.add(:additional_data, "must include 'header_value' for add_header action") + end + end + # Matching logic for different policy types def matches_country?(network_range) country = network_range.country || network_range.inherited_intelligence[:country] diff --git a/app/services/analytics_duckdb_service.rb b/app/services/analytics_duckdb_service.rb new file mode 100644 index 0000000..8664707 --- /dev/null +++ b/app/services/analytics_duckdb_service.rb @@ -0,0 +1,284 @@ +# frozen_string_literal: true + +# Service for managing DuckDB analytics database +# Provides fast analytical queries on events data using columnar storage +class AnalyticsDuckdbService + include Singleton + + DUCKDB_PATH = Rails.root.join("storage", "analytics.duckdb").to_s + BATCH_SIZE = 10_000 + + # Execute block with connection, ensuring database and connection are closed afterward + def with_connection + db = DuckDB::Database.open(DUCKDB_PATH) + conn = db.connect + yield conn + ensure + conn&.close + db&.close + end + + # Create events table if it doesn't exist (must be called within with_connection block) + def setup_schema(conn) + conn.execute(<<~SQL) + CREATE TABLE IF NOT EXISTS events ( + id BIGINT PRIMARY KEY, + timestamp TIMESTAMP NOT NULL, + ip_address VARCHAR, + network_range_id BIGINT, + country VARCHAR, + company VARCHAR, + asn INTEGER, + asn_org VARCHAR, + is_datacenter BOOLEAN, + is_vpn BOOLEAN, + is_proxy BOOLEAN, + waf_action INTEGER, + request_path VARCHAR, + user_agent VARCHAR + ) + SQL + + Rails.logger.info "[DuckDB] Schema setup complete" + end + + # Get timestamp of oldest event in DuckDB + # Returns nil if table is empty + def oldest_event_timestamp + with_connection do |conn| + result = conn.query("SELECT MIN(timestamp) as oldest FROM events") + first_row = result.first + first_row&.first # Returns the value or nil + end + rescue StandardError => e + Rails.logger.error "[DuckDB] Error getting oldest timestamp: #{e.message}" + nil + end + + # Get timestamp of newest event in DuckDB + # Returns nil if table is empty + def newest_event_timestamp + with_connection do |conn| + result = conn.query("SELECT MAX(timestamp) as newest FROM events") + first_row = result.first + first_row&.first # Returns the value or nil + end + rescue StandardError => e + Rails.logger.error "[DuckDB] Error getting newest timestamp: #{e.message}" + nil + end + + # Get maximum event ID already synced to DuckDB + def max_synced_id + with_connection do |conn| + result = conn.query("SELECT COALESCE(MAX(id), 0) as max_id FROM events") + first_row = result.first + first_row&.first || 0 + end + rescue StandardError => e + Rails.logger.error "[DuckDB] Error getting max ID: #{e.message}" + 0 + end + + # Sync new events from PostgreSQL to DuckDB + # Uses PostgreSQL cursor for memory-efficient streaming + # Uses Appender API for fast bulk inserts + # Filters by ID to avoid duplicates + def sync_new_events(from_timestamp) + total_synced = 0 + + with_connection do |conn| + # Ensure table exists + setup_schema(conn) + + # Get max ID already in DuckDB to avoid duplicates + max_id_result = conn.query("SELECT 
COALESCE(MAX(id), 0) as max_id FROM events") + max_id = max_id_result.first&.first || 0 + Rails.logger.info "[DuckDB] Syncing events from #{from_timestamp}, max_id=#{max_id}" + + start_time = Time.current + appender = nil + batch_count = 0 + + begin + # Use PostgreSQL cursor for memory-efficient streaming + Event.where("timestamp >= ? AND id > ?", from_timestamp, max_id) + .select( + :id, + :timestamp, + :ip_address, + :network_range_id, + :country, + :company, + :asn, + :asn_org, + :is_datacenter, + :is_vpn, + :is_proxy, + :waf_action, + :request_path, + :user_agent + ) + .order(:id) + .each_row(block_size: BATCH_SIZE) do |event_data| + # Create new appender for each batch + if batch_count % BATCH_SIZE == 0 + appender&.close # Close previous appender + appender = conn.appender("events") + end + + # Unpack event data from cursor row (Hash from each_row) + begin + appender.append_row( + event_data["id"], + event_data["timestamp"], + event_data["ip_address"]&.to_s, + event_data["network_range_id"], + event_data["country"], + event_data["company"], + event_data["asn"], + event_data["asn_org"], + event_data["is_datacenter"], + event_data["is_vpn"], + event_data["is_proxy"], + event_data["waf_action"], + event_data["request_path"], + event_data["user_agent"] + ) + rescue StandardError => e + Rails.logger.error "[DuckDB] Error appending event #{event_data['id']}: #{e.message}" + Rails.logger.error "[DuckDB] event_data = #{event_data.inspect}" + raise + end + + batch_count += 1 + total_synced += 1 + + # Log progress every BATCH_SIZE events + if batch_count % BATCH_SIZE == 0 + Rails.logger.info "[DuckDB] Synced batch (total: #{total_synced} events)" + end + end + + # Close final appender + appender&.close + + duration = Time.current - start_time + rate = total_synced / duration if duration > 0 + Rails.logger.info "[DuckDB] Sync complete: #{total_synced} events in #{duration.round(2)}s (~#{rate&.round(0)} events/sec)" + rescue StandardError => e + appender&.close rescue nil # Ensure appender is closed on error + Rails.logger.error "[DuckDB] Error syncing events: #{e.message}" + Rails.logger.error e.backtrace.join("\n") + raise # Re-raise to be caught by outer rescue + end + end + + total_synced + rescue StandardError => e + Rails.logger.error "[DuckDB] Sync failed: #{e.message}" + 0 + end + + # Execute analytical query on DuckDB + def query(sql, *params) + with_connection do |conn| + conn.query(sql, *params) + end + rescue StandardError => e + Rails.logger.error "[DuckDB] Query error: #{e.message}" + Rails.logger.error "SQL: #{sql}" + raise + end + + # Get event count in DuckDB + def event_count + with_connection do |conn| + result = conn.query("SELECT COUNT(*) as count FROM events") + first_row = result.first + first_row&.first || 0 + end + rescue StandardError => e + Rails.logger.error "[DuckDB] Error getting event count: #{e.message}" + 0 + end + + # Analytics query: Total events since timestamp + def total_events_since(start_time) + with_connection do |conn| + result = conn.query("SELECT COUNT(*) as count FROM events WHERE timestamp >= ?", start_time) + result.first&.first || 0 + end + end + + # Analytics query: Event breakdown by WAF action + def event_breakdown_by_action(start_time) + with_connection do |conn| + result = conn.query(<<~SQL, start_time) + SELECT waf_action, COUNT(*) as count + FROM events + WHERE timestamp >= ? 
+        GROUP BY waf_action
+      SQL
+
+      # Convert to a hash like PostgreSQL returns
+      # (ruby-duckdb yields each row as an array, so index positionally)
+      result.to_a.to_h { |row| [row[0], row[1]] }
+    end
+  end
+
+  # Analytics query: Top countries
+  def top_countries(start_time, limit = 10)
+    with_connection do |conn|
+      result = conn.query(<<~SQL, start_time, limit)
+        SELECT country, COUNT(*) as count
+        FROM events
+        WHERE timestamp >= ? AND country IS NOT NULL
+        GROUP BY country
+        ORDER BY count DESC
+        LIMIT ?
+      SQL
+
+      result.to_a.map { |row| [row[0], row[1]] }
+    end
+  end
+
+  # Analytics query: Top blocked IPs
+  def top_blocked_ips(start_time, limit = 10)
+    with_connection do |conn|
+      result = conn.query(<<~SQL, start_time, limit)
+        SELECT ip_address, COUNT(*) as count
+        FROM events
+        WHERE timestamp >= ? AND waf_action = 0
+        GROUP BY ip_address
+        ORDER BY count DESC
+        LIMIT ?
+      SQL
+
+      result.to_a.map { |row| [row[0], row[1]] }
+    end
+  end
+
+  # Analytics query: Hourly timeline (events grouped by hour)
+  def hourly_timeline(start_time, end_time)
+    with_connection do |conn|
+      result = conn.query(<<~SQL, start_time, end_time)
+        SELECT
+          DATE_TRUNC('hour', timestamp) as hour,
+          COUNT(*) as count
+        FROM events
+        WHERE timestamp >= ? AND timestamp < ?
+        GROUP BY hour
+        ORDER BY hour
+      SQL
+
+      # Convert to a hash with Time keys like PostgreSQL
+      result.to_a.to_h { |row| [row[0], row[1]] }
+    end
+  end
+
+  # Close is a no-op kept for API compatibility (cleanup/testing):
+  # connections are opened and closed per with_connection call,
+  # so there is no long-lived handle to release here
+  def close
+    nil
+  end
+end
diff --git a/app/services/bot_network_range_importer.rb b/app/services/bot_network_range_importer.rb
new file mode 100644
index 0000000..2bf0a01
--- /dev/null
+++ b/app/services/bot_network_range_importer.rb
@@ -0,0 +1,573 @@
+# frozen_string_literal: true
+
+# BotNetworkRangeImporter - Service for importing official bot network ranges
+#
+# Imports network ranges from official bot provider sources like:
+# - Amazon AWS: https://ip-ranges.amazonaws.com/ip-ranges.json
+# - Google: Official crawler IP lists
+# - Microsoft/Bing: Bot network ranges
+# - Anthropic: Service network ranges
+# - OpenAI: Service network ranges
+class BotNetworkRangeImporter
+  class ImportError < StandardError; end
+
+  # Official sources for bot network ranges
+  BOT_SOURCES = {
+    amazon_aws: {
+      name: 'Amazon AWS',
+      url: 'https://ip-ranges.amazonaws.com/ip-ranges.json',
+      format: :json,
+      parser: :parse_aws_ranges,
+      description: 'Official AWS IP ranges including Amazonbot and other services'
+    },
+    google: {
+      name: 'Google',
+      # Note: These URLs may need to be updated based on current Google documentation
+      urls: [
+        'https://developers.google.com/search/docs/files/googlebot.json',
+        'https://developers.google.com/search/docs/files/special-crawlers.json'
+      ],
+      format: :json,
+      parser: :parse_google_ranges,
+      description: 'Googlebot and other Google crawler IP ranges'
+    },
+    microsoft_bing: {
+      name: 'Microsoft Bing',
+      # Note: Microsoft may require web scraping or API access
+      url: 'https://www.bing.com/toolbox/bingbot.json',
+      format: :json,
+      parser: :parse_microsoft_ranges,
+      description: 'Bingbot and other Microsoft crawler IP ranges'
+    },
+    anthropic: {
+      name: 'Anthropic Claude',
+      # Note: Anthropic ranges may need manual updates or different approach
+      url: 'https://docs.anthropic.com/claude/reference/ip_ranges',
+      format: :html,
+      parser: :parse_anthropic_ranges,
+      description: 'Anthropic Claude API service IP ranges'
+    },
+    openai_searchbot: {
+      name: 'OpenAI SearchBot',
+      url: 'https://openai.com/searchbot.json',
+
format: :json, + parser: :parse_openai_ranges, + description: 'OpenAI SearchBot for ChatGPT search features' + }, + openai_chatgpt_user: { + name: 'OpenAI ChatGPT-User', + url: 'https://openai.com/chatgpt-user.json', + format: :json, + parser: :parse_openai_ranges, + description: 'OpenAI ChatGPT-User for user actions in ChatGPT and Custom GPTs' + }, + openai_gptbot: { + name: 'OpenAI GPTBot', + url: 'https://openai.com/gptbot.json', + format: :json, + parser: :parse_openai_ranges, + description: 'OpenAI GPTBot for training AI foundation models' + }, + cloudflare: { + name: 'Cloudflare', + urls: [ + 'https://www.cloudflare.com/ips-v4', + 'https://www.cloudflare.com/ips-v6' + ], + format: :text, + parser: :parse_cloudflare_ranges, + description: 'Cloudflare network ranges including their crawlers and services' + }, + facebook: { + name: 'Facebook/Meta', + url: 'https://developers.facebook.com/docs/sharing/webmasters/crawler/', + format: :html, + parser: :parse_facebook_ranges, + description: 'Facebook/Meta crawlers and bots' + }, + applebot: { + name: 'Applebot', + url: 'https://support.apple.com/en-us/HT204683', + format: :html, + parser: :parse_applebot_ranges, + description: 'Applebot crawler for Apple search and Siri' + }, + duckduckgo: { + name: 'DuckDuckBot', + url: 'https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot/', + format: :html, + parser: :parse_duckduckgo_ranges, + description: 'DuckDuckGo search crawler' + } + }.freeze + + def self.import_from_source(source_key, options = {}) + source = BOT_SOURCES[source_key.to_sym] + raise ImportError, "Unknown source: #{source_key}" unless source + + puts "Importing bot network ranges from #{source[:name]}..." + + case source[:parser] + when :parse_aws_ranges + parse_aws_ranges(source, options) + when :parse_google_ranges + parse_google_ranges(source, options) + when :parse_microsoft_ranges + parse_microsoft_ranges(source, options) + when :parse_anthropic_ranges + parse_anthropic_ranges(source, options) + when :parse_openai_ranges + parse_openai_ranges(source, options) + when :parse_cloudflare_ranges + parse_cloudflare_ranges(source, options) + when :parse_facebook_ranges + parse_facebook_ranges(source, options) + when :parse_applebot_ranges + parse_applebot_ranges(source, options) + when :parse_duckduckgo_ranges + parse_duckduckgo_ranges(source, options) + else + raise ImportError, "Unknown parser: #{source[:parser]}" + end + end + + def self.import_all_sources(options = {}) + results = {} + + BOT_SOURCES.each do |source_key, source| + puts "\n" + "="*50 + puts "Processing #{source[:name]}..." 
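A usage note on the two entry points above: they are meant to be driven from a Rails console or a rake task, and the only options the parsers read are batch_size and aws_services (values below are illustrative):

    # Import a single provider, keeping only selected AWS service blocks
    BotNetworkRangeImporter.import_from_source(:amazon_aws, aws_services: %w[AMAZON CLOUDFRONT])

    # Import every configured provider with smaller insert batches
    results = BotNetworkRangeImporter.import_all_sources(batch_size: 500)
    results[:google]  # => { imported: 123, source: 'Google' } or { error: "...", imported: 0 }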
+ puts "="*50 + + begin + results[source_key] = import_from_source(source_key, options) + rescue => e + Rails.logger.error "Failed to import from #{source[:name]}: #{e.message}" + results[source_key] = { error: e.message, imported: 0 } + end + end + + puts "\n" + "="*50 + puts "Import Summary" + puts "="*50 + + results.each do |source, result| + if result[:error] + puts "#{source}: FAILED - #{result[:error]}" + else + puts "#{source}: SUCCESS - #{result[:imported]} ranges imported" + end + end + + results + end + + private + + # Amazon AWS IP ranges parser + def self.parse_aws_ranges(source, options = {}) + require 'net/http' + require 'uri' + + uri = URI.parse(source[:url]) + http = Net::HTTP.new(uri.host, uri.port) + http.use_ssl = true + http.read_timeout = 30 + + response = http.get(uri.request_uri) + raise ImportError, "Failed to fetch AWS IP ranges: #{response.code}" unless response.code == '200' + + data = JSON.parse(response.body) + imported_count = 0 + batch_size = options[:batch_size] || 1000 + batch = [] + + # Filter for relevant services (can be customized) + relevant_services = options[:aws_services] || ['AMAZON', 'ROUTE53', 'EC2', 'CLOUDFRONT'] + + data['prefixes'].each do |prefix| + # Focus on relevant services and regions + next unless relevant_services.include?(prefix['service']) + + network_range = { + network: prefix['ip_prefix'], + source: 'bot_import_amazon_aws', + asn: nil, # AWS doesn't provide ASN in this feed + asn_org: 'Amazon Web Services', + company: 'Amazon', + country: nil, + is_datacenter: true, + is_proxy: false, + is_vpn: false, + additional_data: { + aws_service: prefix['service'], + aws_region: prefix['region'], + aws_network_border_group: prefix['network_border_group'], + import_date: Time.current.iso8601 + }.to_json + } + + batch << network_range + + if batch.size >= batch_size + imported_count += import_batch(batch, 'Amazon AWS') + batch = [] + puts "Imported #{imported_count} AWS ranges..." + end + end + + # Import remaining records + if batch.any? 
+      imported_count += import_batch(batch, 'Amazon AWS')
+    end
+
+    puts "Amazon AWS import completed: #{imported_count} ranges imported"
+    { imported: imported_count, source: 'Amazon AWS' }
+  rescue Net::ReadTimeout, Net::OpenTimeout => e
+    # Net::HTTP raises Net::ReadTimeout/Net::OpenTimeout (Net::TimeoutError is not defined)
+    raise ImportError, "Network timeout while fetching AWS ranges: #{e.message}"
+  rescue JSON::ParserError => e
+    raise ImportError, "Failed to parse AWS JSON response: #{e.message}"
+  end
+
+  # Google crawler IP ranges parser
+  def self.parse_google_ranges(source, options = {})
+    imported_count = 0
+    batch_size = options[:batch_size] || 1000
+    batch = []
+    fetched_any = false
+
+    # Each URL is a separate dataset (googlebot.json, special-crawlers.json),
+    # so fetch and import all of them rather than returning after the first success
+    urls = Array(source[:urls] || source[:url])
+
+    urls.each do |url|
+      begin
+        puts "Attempting to fetch Google ranges from: #{url}"
+
+        uri = URI.parse(url)
+        http = Net::HTTP.new(uri.host, uri.port)
+        http.use_ssl = true
+        http.read_timeout = 30
+
+        response = http.get(uri.request_uri)
+        next unless response.code == '200'
+
+        data = JSON.parse(response.body)
+        fetched_any = true
+
+        # Google publishes { "prefixes": [{ "ipv4Prefix": ... }, ...] };
+        # also accept a bare array with cidr/prefix keys in case the format varies
+        entries = data.is_a?(Hash) ? Array(data['prefixes']) : Array(data)
+
+        entries.each do |entry|
+          cidr = entry['ipv4Prefix'] || entry['ipv6Prefix'] || entry['cidr'] || entry['prefix']
+          next unless cidr
+
+          network_range = {
+            network: cidr,
+            source: 'bot_import_google',
+            asn: nil,
+            asn_org: 'Google LLC',
+            company: 'Google',
+            country: nil,
+            is_datacenter: true,
+            is_proxy: false,
+            is_vpn: false,
+            additional_data: {
+              crawler_type: entry['crawler_type'] || 'unknown',
+              user_agent: entry['user_agent'],
+              import_date: Time.current.iso8601
+            }.to_json
+          }
+
+          batch << network_range
+
+          if batch.size >= batch_size
+            imported_count += import_batch(batch, 'Google')
+            batch = []
+            puts "Imported #{imported_count} Google ranges..."
+          end
+        end
+      rescue => e
+        Rails.logger.warn "Failed to fetch from #{url}: #{e.message}"
+        next
+      end
+    end
+
+    raise ImportError, "Failed to fetch Google crawler ranges from any URL" unless fetched_any
+
+    # Import remaining records
+    imported_count += import_batch(batch, 'Google') if batch.any?
+
+    puts "Google import completed: #{imported_count} ranges imported"
+    { imported: imported_count, source: 'Google' }
+  end
+
+  # Microsoft Bing crawler IP ranges parser
+  def self.parse_microsoft_ranges(source, options = {})
+    # Microsoft requires special handling as they may not provide direct JSON
+    # This is a placeholder implementation
+
+    puts "Microsoft Bing crawler import requires manual configuration or web scraping"
+    puts "Refer to: https://www.bing.com/webmaster/help/which-crawlers-does-bing-use"
+
+    {
+      imported: 0,
+      source: 'Microsoft Bing',
+      note: 'Manual configuration required - Microsoft does not provide direct IP range feeds'
+    }
+  end
+
+  # Anthropic service IP ranges parser
+  def self.parse_anthropic_ranges(source, options = {})
+    # Anthropic ranges may need to be manually configured
+    # This is a placeholder implementation
+
+    puts "Anthropic Claude service ranges require manual configuration"
+    puts "Refer to: https://docs.anthropic.com/claude/reference/ip_ranges"
+
+    {
+      imported: 0,
+      source: 'Anthropic',
+      note: 'Manual configuration required - Anthropic does not provide automated IP range feeds'
+    }
+  end
+
+  # OpenAI crawler IP ranges parser
+  def self.parse_openai_ranges(source, options = {})
+    require 'net/http'
+    require 'uri'
+
+    uri = URI.parse(source[:url])
+    http = Net::HTTP.new(uri.host, uri.port)
+    http.use_ssl = true
+    http.read_timeout = 30
+
+    response = http.get(uri.request_uri)
+    raise ImportError, "Failed to fetch OpenAI IP ranges: #{response.code}" unless response.code == '200'
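The Net::HTTP boilerplate above is repeated in each JSON parser; a shared helper along these lines could replace it (a sketch, not part of the patch; fetch_json is a hypothetical name, ImportError is the class defined above):

    def self.fetch_json(url, timeout: 30)
      uri = URI.parse(url)
      http = Net::HTTP.new(uri.host, uri.port)
      http.use_ssl = uri.scheme == 'https'
      http.open_timeout = timeout
      http.read_timeout = timeout
      response = http.get(uri.request_uri)
      raise ImportError, "HTTP #{response.code} from #{url}" unless response.code == '200'
      JSON.parse(response.body)
    end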
+
+    data = JSON.parse(response.body)
+    imported_count = 0
+    batch_size = options[:batch_size] || 1000
+    batch = []
+
+    # Determine crawler type from source name
+    crawler_type = source[:name].gsub('OpenAI ', '').downcase
+
+    # OpenAI publishes these feeds as { "prefixes": [{ "ipv4Prefix": ... }, ...] };
+    # also accept a bare array of entries in case the format varies
+    entries = data.is_a?(Hash) ? Array(data['prefixes']) : Array(data)
+
+    entries.each do |entry|
+      # Entries may use CIDR notation or single IPs under several key names
+      ip_range = entry['ipv4Prefix'] || entry['ipv6Prefix'] || entry['cidr'] || entry['ip_prefix'] || entry['ip']
+      next unless ip_range
+
+      # Convert single IPs to /32
+      network = ip_range.include?('/') ? ip_range : "#{ip_range}/32"
+
+      network_range = {
+        network: network,
+        source: "bot_import_openai_#{crawler_type}",
+        asn: nil,
+        asn_org: 'OpenAI',
+        company: 'OpenAI',
+        country: nil,
+        is_datacenter: true,
+        is_proxy: false,
+        is_vpn: false,
+        additional_data: {
+          crawler_type: crawler_type,
+          crawler_purpose: crawler_purpose(crawler_type),
+          user_agent: openai_user_agent(crawler_type),
+          import_date: Time.current.iso8601,
+          source_url: source[:url]
+        }.to_json
+      }
+
+      batch << network_range
+
+      if batch.size >= batch_size
+        imported_count += import_batch(batch, "OpenAI #{crawler_type}")
+        batch = []
+        puts "Imported #{imported_count} OpenAI #{crawler_type} ranges..."
+      end
+    end
+
+    # Import remaining records
+    if batch.any?
+      imported_count += import_batch(batch, "OpenAI #{crawler_type}")
+    end
+
+    puts "OpenAI #{crawler_type} import completed: #{imported_count} ranges imported"
+    { imported: imported_count, source: "OpenAI #{crawler_type}" }
+  rescue Net::ReadTimeout, Net::OpenTimeout => e
+    # crawler_type may not be assigned yet if the fetch itself timed out
+    raise ImportError, "Network timeout while fetching #{source[:name]} ranges: #{e.message}"
+  rescue JSON::ParserError => e
+    raise ImportError, "Failed to parse #{source[:name]} JSON response: #{e.message}"
+  end
+
+  def self.import_batch(batch_data, source_name)
+    # Check for existing ranges to avoid duplicates
+    existing_networks = NetworkRange.where(network: batch_data.map { |d| d[:network] }).pluck(:network)
+    new_ranges = batch_data.reject { |d| existing_networks.include?(d[:network]) }
+
+    if new_ranges.any?
+ NetworkRange.insert_all(new_ranges) + puts "Imported #{new_ranges.size} new #{source_name} ranges (#{batch_data.size - new_ranges.size} duplicates skipped)" + else + puts "No new #{source_name} ranges to import (all duplicates)" + end + + new_ranges.size + rescue => e + Rails.logger.error "Failed to import #{source_name} batch: #{e.message}" + + # Fallback to individual imports + imported = 0 + new_ranges.each do |data| + begin + NetworkRange.create!(data) + imported += 1 + rescue => individual_error + Rails.logger.error "Failed to import individual #{source_name} record: #{individual_error.message}" + end + end + + imported + end + + # Helper method to determine crawler purpose based on type + def self.crawler_purpose(crawler_type) + case crawler_type + when 'searchbot' + 'Used to link to and surface websites in search results in ChatGPT\'s search features' + when 'chatgpt-user' + 'User actions in ChatGPT and Custom GPTs, including GPT Actions' + when 'gptbot' + 'Used to crawl content for training OpenAI\'s generative AI foundation models' + else + 'Unknown purpose' + end + end + + # Helper method to get OpenAI user agent strings + def self.openai_user_agent(crawler_type) + case crawler_type + when 'searchbot' + 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; OAI-SearchBot/1.0; +https://openai.com/searchbot' + when 'chatgpt-user' + 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; ChatGPT-User/1.0; +https://openai.com/bot' + when 'gptbot' + 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; GPTBot/1.1; +https://openai.com/gptbot' + else + 'Unknown user agent' + end + end + + # Cloudflare IP ranges parser + def self.parse_cloudflare_ranges(source, options = {}) + require 'net/http' + require 'uri' + + imported_count = 0 + urls = Array(source[:urls]) + batch_size = options[:batch_size] || 1000 + batch = [] + + urls.each do |url| + begin + puts "Fetching Cloudflare ranges from: #{url}" + + uri = URI.parse(url) + http = Net::HTTP.new(uri.host, uri.port) + http.use_ssl = true + http.read_timeout = 30 + + response = http.get(uri.request_uri) + raise ImportError, "Failed to fetch Cloudflare ranges: #{response.code}" unless response.code == '200' + + # Cloudflare provides plain text CIDR lists + lines = response.body.split("\n") + ip_version = url.include?('ips-v4') ? 4 : 6 + + lines.each do |line| + line = line.strip + next if line.empty? || line.start_with?('#') + + # Validate CIDR format + next unless line.match?(/\A[0-9a-fA-F:.]+\/\d+\z/) + + network_range = { + network: line, + source: 'bot_import_cloudflare', + asn: nil, + asn_org: 'Cloudflare', + company: 'Cloudflare', + country: nil, + is_datacenter: true, + is_proxy: false, + is_vpn: false, + additional_data: { + ip_version: ip_version, + import_date: Time.current.iso8601, + source_url: url, + service_type: 'cdn_and_security' + }.to_json + } + + batch << network_range + + if batch.size >= batch_size + imported_count += import_batch(batch, 'Cloudflare') + batch = [] + puts "Imported #{imported_count} Cloudflare ranges..." + end + end + + rescue => e + Rails.logger.warn "Failed to fetch Cloudflare ranges from #{url}: #{e.message}" + next + end + end + + # Import remaining records + if batch.any? 
+ imported_count += import_batch(batch, 'Cloudflare') + end + + puts "Cloudflare import completed: #{imported_count} ranges imported" + { imported: imported_count, source: 'Cloudflare' } + end + + # Facebook/Meta crawler ranges parser (placeholder) + def self.parse_facebook_ranges(source, options = {}) + puts "Facebook/Meta crawler ranges require web scraping or manual configuration" + puts "Refer to: https://developers.facebook.com/docs/sharing/webmasters/crawler/" + + { + imported: 0, + source: 'Facebook/Meta', + note: 'Manual configuration required - Facebook does not provide automated IP range feeds' + } + end + + # Applebot crawler ranges parser (placeholder) + def self.parse_applebot_ranges(source, options = {}) + puts "Applebot ranges require web scraping or manual configuration" + puts "Refer to: https://support.apple.com/en-us/HT204683" + + { + imported: 0, + source: 'Applebot', + note: 'Manual configuration required - Apple does not provide automated IP range feeds' + } + end + + # DuckDuckBot crawler ranges parser (placeholder) + def self.parse_duckduckgo_ranges(source, options = {}) + puts "DuckDuckBot ranges require web scraping or manual configuration" + puts "Refer to: https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot/" + + { + imported: 0, + source: 'DuckDuckBot', + note: 'Manual configuration required - DuckDuckGo does not provide automated IP range feeds' + } + end +end \ No newline at end of file diff --git a/app/services/ipapi.rb b/app/services/ipapi.rb index 1afc1bd..2f5ed80 100644 --- a/app/services/ipapi.rb +++ b/app/services/ipapi.rb @@ -53,4 +53,107 @@ class Ipapi next end end + + # Parse company/datacenter network range from IPAPI data + # Handles "X.X.X.X - Y.Y.Y.Y" format and converts to CIDR + def self.parse_company_network_range(ipapi_data) + # Try company.network first, then datacenter.network + network_range = ipapi_data.dig('company', 'network') || ipapi_data.dig('datacenter', 'network') + return nil if network_range.blank? 
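The range-to-CIDR conversion below rests on two facts: a CIDR block's size is a power of two, and its base address is a multiple of that size. A worked example with illustrative addresses:

    require 'ipaddr'

    start_ip = IPAddr.new('13.107.6.0')
    end_ip   = IPAddr.new('13.107.6.255')
    num_ips  = end_ip.to_i - start_ip.to_i + 1  # => 256
    32 - Math.log2(num_ips).to_i                # => 24, i.e. "13.107.6.0/24"
    # A 256-address span starting at 13.107.6.128 is NOT a /24, which is
    # why the alignment check below rejects unaligned start addresses.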
+
+    # Parse "X.X.X.X - Y.Y.Y.Y" format
+    if network_range.include?(' - ')
+      start_ip_str, end_ip_str = network_range.split(' - ').map(&:strip)
+
+      begin
+        start_ip = IPAddr.new(start_ip_str)
+        end_ip = IPAddr.new(end_ip_str)
+
+        # The math below assumes IPv4 (32-bit address space)
+        return nil unless start_ip.ipv4? && end_ip.ipv4?
+
+        # Calculate the number of IPs in the range
+        num_ips = end_ip.to_i - start_ip.to_i + 1
+
+        # Calculate prefix length from number of IPs
+        # num_ips = 2^(32 - prefix_length) for IPv4
+        prefix_length = 32 - Math.log2(num_ips).to_i
+
+        # Verify it's a valid CIDR block: the size must be a power of 2
+        # and the start address must be aligned to that block size
+        if 2**(32 - prefix_length) == num_ips && (start_ip.to_i % num_ips).zero?
+          cidr = "#{start_ip_str}/#{prefix_length}"
+          Rails.logger.debug "Parsed company network range: #{network_range} -> #{cidr}"
+          return cidr
+        else
+          Rails.logger.warn "Network range #{network_range} is not a valid CIDR block (#{num_ips} IPs from #{start_ip_str})"
+          return nil
+        end
+      rescue IPAddr::InvalidAddressError => e
+        Rails.logger.error "Invalid IP in company network range: #{network_range} (#{e.message})"
+        return nil
+      end
+    elsif network_range.include?('/')
+      # Already in CIDR format
+      return network_range
+    else
+      Rails.logger.warn "Unknown network range format: #{network_range}"
+      return nil
+    end
+  end
+
+  # Populate NetworkRange attributes from IPAPI data
+  def self.populate_network_attributes(network_range, ipapi_data)
+    network_range.asn = ipapi_data.dig('asn', 'asn')
+    network_range.asn_org = ipapi_data.dig('asn', 'org') || ipapi_data.dig('company', 'name')
+    network_range.company = ipapi_data.dig('company', 'name')
+    network_range.country = ipapi_data.dig('location', 'country_code')
+    network_range.is_datacenter = ipapi_data['is_datacenter'] || false
+    network_range.is_vpn = ipapi_data['is_vpn'] || false
+    network_range.is_proxy = ipapi_data['is_proxy'] || false
+  end
+
+  # Process IPAPI data and create network ranges
+  # Returns a hash: { networks: [NetworkRange, ...], broadest_cidr: String }
+  def self.process_ipapi_data(ipapi_data, tracking_network)
+    created_networks = []
+
+    # Extract and create company/datacenter network range if present
+    company_network_cidr = parse_company_network_range(ipapi_data)
+    if company_network_cidr.present?
+      company_range = NetworkRange.find_or_create_by(network: company_network_cidr) do |nr|
+        nr.source = 'api_imported'
+        nr.creation_reason = "Company allocation from IPAPI for #{tracking_network.cidr}"
+      end
+
+      # Always update attributes (whether new or existing)
+      populate_network_attributes(company_range, ipapi_data)
+      company_range.set_network_data(:ipapi, ipapi_data)
+      company_range.last_api_fetch = Time.current
+      company_range.save!
+
+      created_networks << company_range
+      Rails.logger.info "Created/updated company network: #{company_range.cidr}"
+    end
+
+    # Extract and create ASN route network if present
+    ipapi_route = ipapi_data.dig('asn', 'route')
+    if ipapi_route.present? && ipapi_route != tracking_network.cidr
+      route_network = NetworkRange.find_or_create_by(network: ipapi_route) do |nr|
+        nr.source = 'api_imported'
+        nr.creation_reason = "BGP route from IPAPI lookup for #{tracking_network.cidr}"
+      end
+
+      # Always update attributes (whether new or existing)
+      populate_network_attributes(route_network, ipapi_data)
+      route_network.set_network_data(:ipapi, ipapi_data)
+      route_network.last_api_fetch = Time.current
+      route_network.save!
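For orientation, the summary hash this method assembles at the end is consumed by callers roughly like so (a sketch; the calling job is outside this diff):

    result = Ipapi.process_ipapi_data(ipapi_data, tracking_network)
    result[:networks]       # => [NetworkRange, ...] created or refreshed by this lookup
    result[:broadest_cidr]  # => CIDR string used to deduplicate follow-up fetches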
+ + created_networks << route_network + Rails.logger.info "Created/updated BGP route network: #{route_network.cidr}" + end + + # Return both the created networks and the broadest CIDR for deduplication + { + networks: created_networks, + broadest_cidr: company_network_cidr.presence || ipapi_route || tracking_network.cidr + } + end end \ No newline at end of file diff --git a/app/views/analytics/networks.html.erb b/app/views/analytics/networks.html.erb index 8dd7f37..7c8fa8e 100644 --- a/app/views/analytics/networks.html.erb +++ b/app/views/analytics/networks.html.erb @@ -141,8 +141,11 @@ class: "text-blue-600 hover:text-blue-800 hover:underline font-mono font-medium" %>
Import and manage official network ranges for search crawlers and API bots
+<%= source[:description] %>
+ +Import from all available sources (this may take several minutes).
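The import form below posts to a bulk-import action; the controller side is not part of this diff, but it would look roughly like this (route and method names are assumptions):

    # app/controllers/bot_network_ranges_controller.rb (hypothetical)
    def import_all
      results = BotNetworkRangeImporter.import_all_sources
      imported = results.values.sum { |r| r[:imported].to_i }
      redirect_to bot_network_ranges_path, notice: "Imported #{imported} new ranges"
    end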
+ + <%= form_with url: import_all_bot_network_ranges_path, method: :post do |f| %> +| Source | +Status | +Records | +Date | +Notes | +
|---|---|---|---|---|
| + <%= import.source.titleize %> + | ++ + <%= import.status.titleize %> + + | ++ <%= import.records_processed&.to_s || '0' %> + | ++ <%= import.created_at.strftime('%Y-%m-%d %H:%M') %> + | ++ <%= import.notes %> + | +
| Network | +Source | +Company | +Created | +Details | +
|---|---|---|---|---|
| + <%= range.network %> + | ++ <%= range.source.gsub('bot_import_', '').titleize %> + | ++ <%= range.company || 'Unknown' %> + | ++ <%= range.created_at.strftime('%Y-%m-%d %H:%M') %> + | ++ <% if range.additional_data.present? %> + <% data = JSON.parse(range.additional_data) rescue {} %> + <% if data['crawler_type'] %> + + <%= data['crawler_type'].titleize %> + + <% end %> + <% if data['aws_service'] %> + + <%= data['aws_service'] %> + + <% end %> + <% end %> + | +
Network ranges imported from <%= @source_name %> official sources
+| Network | +Source | +Company | +Country | +Created | +Details | +
|---|---|---|---|---|---|
| + <%= link_to range.network, network_range_path(range), class: "text-blue-600 hover:text-blue-800" %> + | ++ <%= range.source.gsub('bot_import_', '').titleize %> + | ++ <%= range.company || 'Unknown' %> + | ++ <%= range.country || 'Unknown' %> + | ++ <%= range.created_at.strftime('%Y-%m-%d %H:%M') %> + | +
+ <% if range.additional_data.present? %>
+ <% data = JSON.parse(range.additional_data) rescue {} %>
+
+ <% if data['crawler_type'] %>
+
+ <%= data['crawler_type'].titleize %>
+
+ <% end %>
+ <% if data['crawler_purpose'] %>
+
+ Purpose
+
+ <% end %>
+ <% if data['aws_service'] %>
+
+ <%= data['aws_service'] %>
+
+ <% end %>
+ <% if data['aws_region'] %>
+
+ <%= data['aws_region'] %>
+
+ <% end %>
+ <% if data['ip_version'] %>
+
+ IPv<%= data['ip_version'] %>
+
+ <% end %>
+
+ <% end %>
+ |
+
+ No <%= @source_name %> network ranges have been imported yet. +
+ <%= link_to "Import #{@source_name} Ranges", bot_network_ranges_path, + class: "inline-flex items-center px-4 py-2 border border-transparent text-sm font-medium rounded-md shadow-sm text-white bg-blue-600 hover:bg-blue-700 focus:outline-none focus:ring-2 focus:ring-offset-2 focus:ring-blue-500" %> +Create a WAF rule to allow, block, or rate limit traffic
-What action to take when this rule matches
The HTTP header name to add (e.g., X-Bot-Agent, X-Network-Type)
+The value for the header (e.g., BingBot, GoogleBot, Unknown)
+Leave blank for permanent rule
+When this rule should automatically expire
+
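Taken together, the add_header support added across Rule, WafPolicy, and this form produces records shaped roughly like this (a sketch with illustrative values; it assumes the default-filling callbacks in this diff run as standard before_validation hooks):

    rule = Rule.new(
      waf_rule_type: :network,
      waf_action: :add_header,
      source: 'manual',
      metadata: { 'header_name' => 'X-Bot-Agent', 'header_value' => 'GoogleBot' }
    )
    rule.add_header_action?  # => true
    rule.header_name         # => "X-Bot-Agent"

    policy = WafPolicy.new(policy_type: 'company', policy_action: 'add_header', targets: ['Acme Corp'])
    policy.valid?
    policy.additional_data['header_name']  # => "X-Bot-Agent" (default applied)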