# frozen_string_literal: true

class Event < ApplicationRecord
  # Normalized association for hosts (most valuable compression)
  belongs_to :request_host, optional: true

  # WAF rule associations
  belongs_to :rule, optional: true
  has_one :waf_policy, through: :rule

  # Enums for fixed value sets
  # Canonical WAF action order - aligned with Rule and Agent models
  #
  # IMPORTANT: These values were swapped to match baffle-agent convention:
  # - deny: 0 (blocked traffic)
  # - allow: 1 (allowed traffic)
  #
  # When using raw integer values in queries:
  # - waf_action = 0 -> denied/blocked requests
  # - waf_action = 1 -> allowed requests
  # - waf_action = 2 -> redirect requests
  # - waf_action = 3 -> challenge requests
  # - waf_action = 4 -> log-only requests
  enum :waf_action, {
    deny: 0,      # deny/block
    allow: 1,     # allow/pass
    redirect: 2,  # redirect
    challenge: 3, # challenge (CAPTCHA, JS challenge, etc.)
    log: 4        # log only, no action (monitoring mode)
  }, default: :allow, scopes: false

  enum :request_method, {
    get: 0,     # GET
    post: 1,    # POST
    put: 2,     # PUT
    patch: 3,   # PATCH
    delete: 4,  # DELETE
    head: 5,    # HEAD
    options: 6  # OPTIONS
  }, default: :get, scopes: false

  # Serialize segment IDs as an array for easy manipulation in Rails
  serialize :request_segment_ids, type: Array, coder: JSON

  # Tags are stored as JSON arrays in a PostgreSQL jsonb column
  # This provides direct array access and efficient indexing
  attribute :tags, :json, default: -> { [] }

  validates :request_id, presence: true, uniqueness: true
  validates :timestamp, presence: true

  scope :recent, -> { order(timestamp: :desc) }
  scope :by_ip, ->(ip) { where(ip_address: ip) }
  scope :by_user_agent, ->(user_agent) { where(user_agent: user_agent) }
  scope :by_waf_action, ->(waf_action) { where(waf_action: waf_action) }
  scope :blocked, -> { where(waf_action: :deny) }
  scope :allowed, -> { where(waf_action: :allow) }
  scope :logged, -> { where(waf_action: :log) }

  # Tag-based filtering scopes using PostgreSQL jsonb operators
  # (the column is jsonb, so containment uses @> with a jsonb operand and
  # element overlap uses ?| with a text array; text[] ARRAY operands would
  # not type-check against jsonb)
  scope :with_tag, ->(tag) { where("tags @> ?::jsonb", [tag.to_s].to_json) }
  scope :with_any_tags, ->(tags) {
    return none if tags.blank?
    tag_array = Array(tags).map(&:to_s)
    # Named binds leave the ?| operator untouched
    where("tags ?| array[:tags]", tags: tag_array)
  }
  scope :with_all_tags, ->(tags) {
    return none if tags.blank?
    tag_array = Array(tags).map(&:to_s)
    where("tags @> ?::jsonb", tag_array.to_json)
  }

  # Network-based filtering scopes - now using denormalized columns
  scope :by_company, ->(company) { where("company ILIKE ?", "%#{sanitize_sql_like(company)}%") }
  scope :by_country, ->(country) { where(country: country) }
  scope :by_network_type, ->(type) {
    case type.to_s.downcase
    when "datacenter"
      where(is_datacenter: true)
    when "vpn"
      where(is_vpn: true)
    when "proxy"
      where(is_proxy: true)
    when "standard"
      where(is_datacenter: false, is_vpn: false, is_proxy: false)
    else
      none
    end
  }
  scope :by_asn, ->(asn) { where(asn: asn.to_i) }
  scope :by_network_cidr, ->(cidr) {
    # This still requires a join since we need to match the CIDR
    joins(:network_range).where("network_ranges.network = ?", cidr)
  }

  # Bot filtering scopes
  scope :bots, -> { where(is_bot: true) }
  scope :humans, -> { where(is_bot: false) }
  scope :exclude_bots, -> { where(is_bot: false) }

  # Association for the optional network_range_id
  belongs_to :network_range, optional: true
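  # Usage sketch for the scopes above (illustrative only; assumes seeded
  # events and the jsonb `tags` column declared here):
  #
  #   Event.blocked.with_tag("bot:googlebot").recent.limit(10)
  #   Event.with_any_tags(%w[bot:curl bot:wget]).by_country("US")
  #   Event.by_network_type(:datacenter).exclude_bots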
  # Path prefix matching using range queries (uses the B-tree index efficiently)
  scope :with_path_prefix, ->(prefix_segment_ids) {
    return none if prefix_segment_ids.blank?

    # Use range queries instead of LIKE for index usage
    # Example: a [1, 2] prefix matches [1, 2], [1, 2, 3], [1, 2, 3, 4], etc.
    prefix_str = prefix_segment_ids.to_json # "[1,2]"

    # For an exact match:  request_segment_ids = "[1,2]"
    # For a prefix match:  "[1,2," <= request_segment_ids < "[1,2-"
    # The serialized strings sort lexicographically, and '-' is the ASCII
    # character immediately after ',', so the half-open range captures exactly
    # the strings that begin with "[1,2,". (Incrementing the last segment ID
    # to build the upper bound would be wrong: "[1,20,...]" sorts between
    # "[1,2," and "[1,3," yet is not a [1,2] prefix match.)
    lower_prefix_str = "#{prefix_str[0..-2]}," # "[1,2," - matches longer paths
    upper_str = "#{prefix_str[0..-2]}-"        # "[1,2-" - exclusive upper bound

    # Use raw SQL to bypass the serializer (it expects an Array, but we are
    # comparing serialized strings)
    where("request_segment_ids = ? OR (request_segment_ids >= ? AND request_segment_ids < ?)",
          prefix_str, lower_prefix_str, upper_str)
  }

  # Path depth queries (the column stores serialized JSON as text, so cast to
  # json before applying json_array_length)
  scope :path_depth, ->(depth) {
    where("json_array_length(request_segment_ids::json) = ?", depth)
  }
  scope :path_depth_greater_than, ->(depth) {
    where("json_array_length(request_segment_ids::json) > ?", depth)
  }

  # Analytics: Get response time percentiles over different time windows
  def self.response_time_percentiles(windows: { hour: 1.hour, day: 1.day, week: 1.week })
    results = {}

    windows.each do |label, duration|
      scope = where('timestamp >= ?', duration.ago)
      stats = scope.pick(
        Arel.sql(<<~SQL.squish)
          percentile_cont(0.5) WITHIN GROUP (ORDER BY response_time_ms) as p50,
          percentile_cont(0.95) WITHIN GROUP (ORDER BY response_time_ms) as p95,
          percentile_cont(0.99) WITHIN GROUP (ORDER BY response_time_ms) as p99,
          COUNT(*) as count
        SQL
      )

      results[label] = if stats
        {
          p50: stats[0]&.round(2),
          p95: stats[1]&.round(2),
          p99: stats[2]&.round(2),
          count: stats[3]
        }
      else
        { p50: nil, p95: nil, p99: nil, count: 0 }
      end
    end

    results
  end

  # Helper methods
  def path_depth
    request_segment_ids&.length || 0
  end

  def reconstructed_path
    return request_path if request_segment_ids.blank?

    segments = PathSegment.where(id: request_segment_ids).index_by(&:id)
    '/' + request_segment_ids.map { |id| segments[id]&.segment }.compact.join('/')
  end

  # Extract key fields from payload before saving
  before_validation :extract_fields_from_payload

  # Normalize event fields after extraction
  after_validation :normalize_event_fields, if: :should_normalize?

  # Populate network intelligence from IP address before saving
  before_save :populate_network_intelligence, if: :should_populate_network_intelligence?

  # Detect bot traffic using user agent and network intelligence
  before_save :detect_bot_traffic, if: :should_detect_bot?

  # Backfill network intelligence for all events
  def self.backfill_network_intelligence!(batch_size: 10_000)
    total = where(country: nil).count
    return 0 if total.zero?

    puts "Backfilling network intelligence for #{total} events..."

    processed = 0
    where(country: nil).find_in_batches(batch_size: batch_size) do |batch|
      batch.each(&:save) # Triggers before_save callback
      processed += batch.size
      puts "  Processed #{processed}/#{total} (#{(processed.to_f / total * 100).round(1)}%)"
    end

    processed
  end

  # Backfill network intelligence for a specific event
  def backfill_network_intelligence!
    populate_network_intelligence
    save!
  end
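  # Illustrative sketch of the prefix-range comparison and the percentile
  # helper (hypothetical segment IDs and figures):
  #
  #   [1, 2].to_json     # => "[1,2]"    (matched by the exact-match clause)
  #   [1, 2, 3].to_json  # => "[1,2,3]"  (>= "[1,2," and < "[1,2-")
  #   [1, 20, 5].to_json # => "[1,20,5]" (excluded: '0' sorts after '-')
  #
  #   Event.with_path_prefix([1, 2]).path_depth_greater_than(2)
  #   Event.response_time_percentiles(windows: { hour: 1.hour })
  #   # => { hour: { p50: 12.5, p95: 98.2, p99: 240.1, count: 1042 } }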
  def self.create_from_waf_payload!(request_id, payload)
    # Normalize headers in the payload during the import phase
    normalized_payload = normalize_payload_headers(payload)

    # Create the WAF request event
    create!(
      request_id: request_id,
      timestamp: parse_timestamp(normalized_payload["timestamp"]),
      payload: normalized_payload,

      # WAF-specific fields
      ip_address: normalized_payload.dig("request", "ip"),
      # Headers were downcased above, so only the lowercase key is needed
      user_agent: normalized_payload.dig("request", "headers", "user-agent"),
      # request_method will be set by extract_fields_from_payload + normalize_event_fields
      request_path: normalized_payload.dig("request", "path"),
      request_url: normalized_payload.dig("request", "url"),
      # request_protocol will be set by extract_fields_from_payload + normalize_event_fields
      response_status: normalized_payload.dig("response", "status_code"),
      response_time_ms: normalized_payload.dig("response", "duration_ms"),
      waf_action: normalize_action(normalized_payload["waf_action"]), # Normalize incoming action values

      # Support both new (rule_id) and old (rule_matched) field names during cutover
      rule_id: normalized_payload["rule_id"] || normalized_payload["rule_matched"],
      blocked_reason: normalized_payload["blocked_reason"],

      # Server/Environment info
      server_name: normalized_payload["server_name"],
      environment: normalized_payload["environment"],

      # WAF agent info
      agent_version: normalized_payload.dig("agent", "version"),
      agent_name: normalized_payload.dig("agent", "name")
    )
  end

  # Normalize header keys in the payload to lower case during the import phase
  def self.normalize_payload_headers(payload)
    return payload unless payload.is_a?(Hash)

    # Create a deep copy to avoid modifying the original
    normalized = Marshal.load(Marshal.dump(payload))

    # Normalize request headers
    if normalized.dig("request", "headers").is_a?(Hash)
      normalized["request"]["headers"] = normalized["request"]["headers"].transform_keys(&:downcase)
    end

    # Normalize response headers if they exist
    if normalized.dig("response", "headers").is_a?(Hash)
      normalized["response"]["headers"] = normalized["response"]["headers"].transform_keys(&:downcase)
    end

    normalized
  end

  def self.normalize_action(action)
    return "allow" if action.blank? # blank? also covers nil

    case action.to_s.downcase
    when "allow", "pass", "allowed"
      "allow"
    when "deny", "block", "blocked", "deny_access"
      "deny"
    when "challenge"
      "challenge"
    when "redirect"
      "redirect"
    when "log"
      # Monitoring mode; maps to the log enum value rather than falling
      # through to the unknown-action warning
      "log"
    else
      Rails.logger.warn "Unknown action '#{action}', defaulting to 'allow'"
      "allow"
    end
  end

  def self.parse_timestamp(timestamp)
    case timestamp
    when String
      Time.parse(timestamp)
    when Numeric
      # Sentry timestamps can be in seconds with decimals
      Time.at(timestamp)
    when Time
      timestamp
    else
      Time.current
    end
  rescue => e
    Rails.logger.error "Failed to parse timestamp #{timestamp}: #{e.message}"
    Time.current
  end

  def request_details
    return {} unless payload.present?

    request_data = payload.dig("request") || {}
    {
      ip: request_data["ip"],
      method: request_data["method"],
      path: request_data["path"],
      url: request_data["url"],
      protocol: request_data["protocol"],
      headers: request_data["headers"] || {},
      query: request_data["query"] || {},
      body_size: request_data["body_size"]
    }
  end

  def response_details
    return {} unless payload.present?

    response_data = payload.dig("response") || {}
    {
      status_code: response_data["status_code"],
      duration_ms: response_data["duration_ms"],
      size: response_data["size"]
    }
  end

  def geo_details
    return {} unless payload.present?

    payload.dig("geo") || {}
  end
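  # Minimal sketch of the payload shape create_from_waf_payload! expects
  # (field names inferred from the dig calls above; values are illustrative):
  #
  #   Event.create_from_waf_payload!("req-123", {
  #     "timestamp"  => "2024-01-01T00:00:00Z",
  #     "waf_action" => "block",                  # normalized to "deny"
  #     "rule_id"    => 42,
  #     "request"    => { "ip" => "203.0.113.7", "path" => "/admin",
  #                       "headers" => { "User-Agent" => "curl/8.0" } },
  #     "response"   => { "status_code" => 403, "duration_ms" => 3 },
  #     "agent"      => { "name" => "baffle-agent", "version" => "1.0.0" }
  #   })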
payload.dig("geo") || {} end def tags # Use the dedicated tags column (array), fallback to payload during transition super.presence || (payload&.dig("tags") || []) end def headers raw_headers = payload&.dig("request", "headers") || {} normalize_headers(raw_headers) end def query_params payload&.dig("request", "query") || {} end def blocked? waf_action == 'deny' # deny = 0 end def allowed? waf_action == 'allow' # allow = 1 end def logged? waf_action == 'log' end def challenged? waf_action == 'challenge' end def rule_matched? rule_id.present? end # New path methods for normalization def path_segments return [] unless request_path.present? request_path.split('/').reject(&:blank?) end def path_segments_array @path_segments_array ||= path_segments end def request_hostname return nil unless request_url.present? URI.parse(request_url).hostname rescue nil end # Tag helper methods def add_tag(tag) tag_str = tag.to_s self.tags = (tags + [tag_str]).uniq unless tags.include?(tag_str) end def remove_tag(tag) tag_str = tag.to_s self.tags = tags - [tag_str] if tags.include?(tag_str) end def has_tag?(tag) tags.include?(tag.to_s) end def tag_list tags.join(', ') end # Normalize headers to lower case keys during import phase def normalize_headers(headers) return {} unless headers.is_a?(Hash) headers.transform_keys(&:downcase) end # Network range resolution methods def matching_network_ranges return [] unless ip_address.present? NetworkRange.contains_ip(ip_address.to_s).map do |range| { range: range, cidr: range.cidr, prefix_length: range.prefix_length, specificity: range.prefix_length, intelligence: range.inherited_intelligence } end.sort_by { |r| -r[:specificity] } # Most specific first end def most_specific_range # Use the cached network_range_id if available (much faster) return NetworkRange.find_by(id: network_range_id) if network_range_id.present? # Fallback to expensive lookup matching_network_ranges.first&.dig(:range) end def broadest_range matching_network_ranges.last&.dig(:range) end def network_intelligence # Use denormalized fields instead of expensive lookup { country: country, company: company, asn: asn, asn_org: asn_org, is_datacenter: is_datacenter, is_vpn: is_vpn, is_proxy: is_proxy } end # Denormalized attribute accessors - these now use the columns directly # No need to override - Rails provides these automatically: # - country (column) # - company (column) # - asn (column) # - asn_org (column) # - is_datacenter (column) # - is_vpn (column) # - is_proxy (column) # IP validation def valid_ipv4? return false unless ip_address.present? IPAddr.new(ip_address).ipv4? rescue IPAddr::InvalidAddressError false end def valid_ipv6? return false unless ip_address.present? IPAddr.new(ip_address).ipv6? rescue IPAddr::InvalidAddressError false end def valid_ip? valid_ipv4? || valid_ipv6? end # Rules affecting this IP def matching_rules return Rule.none unless ip_address.present? # Get all network ranges that contain this IP range_ids = matching_network_ranges.map { |r| r[:range].id } # Find rules for those ranges, ordered by priority (most specific first) Rule.network_rules .where(network_range_id: range_ids) .enabled .includes(:network_range) .order('masklen(network_ranges.network) DESC') end def active_blocking_rules matching_rules.where(waf_action: :deny) end def has_blocking_rules? active_blocking_rules.exists? 
  # Get full geo location details from the network range
  def geo_location
    network_info = network_intelligence
    {
      country_code: network_info[:country],
      ip_address: ip_address,
      has_data: network_info[:country].present?,
      network_intelligence: network_info
    }
  end

  # Check if the event has valid geo location data via its network range
  def has_geo_data?
    network_intelligence[:country].present?
  end

  # Look up the country code for this event's IP via its network range
  def lookup_country
    return nil if ip_address.blank?

    network_info = network_intelligence
    network_info[:country]
  rescue => e
    Rails.logger.error "Network lookup failed for #{ip_address}: #{e.message}"
    nil
  end

  private

  def should_normalize?
    request_host_id.nil? || request_segment_ids.blank?
  end

  def normalize_event_fields
    EventNormalizer.normalize_event!(self)
  rescue => e
    Rails.logger.error "Failed to normalize event #{id}: #{e.message}"
  end

  def should_populate_network_intelligence?
    # Only populate if an IP is present and the country is not yet set.
    # Also repopulate if the IP address changed (rare case).
    ip_address.present? && (country.blank? || ip_address_changed?)
  end

  def populate_network_intelligence
    return unless ip_address.present?

    # Convert IPAddr to string for the PostgreSQL query
    ip_string = ip_address.to_s

    # CRITICAL: Always find_or_create the /24 tracking network for public IPs.
    # This /24 serves as:
    # 1. The tracking unit for IPAPI deduplication (stores ipapi_queried_at)
    # 2. The reference point for preventing duplicate API calls
    # 3. The fallback network if no more specific GeoIP data exists
    tracking_network = find_or_create_tracking_network(ip_string)

    # Find the most specific network range with actual GeoIP data.
    # This might be more specific (e.g., /25) or broader (e.g., /22) than the /24.
    data_range = NetworkRange.where("network >>= ?", ip_string)
                             .where.not(country: nil) # Must have actual data
                             .order(Arel.sql("masklen(network) DESC"))
                             .first

    # Use the most specific range with data, or fall back to the tracking network
    range = data_range || tracking_network

    if range
      # Populate all network intelligence fields from the range
      self.country = range.country
      self.company = range.company
      self.asn = range.asn
      self.asn_org = range.asn_org
      self.is_datacenter = range.is_datacenter || false
      self.is_vpn = range.is_vpn || false
      self.is_proxy = range.is_proxy || false
    else
      # No range at all (shouldn't happen, but defensive)
      self.is_datacenter = false
      self.is_vpn = false
      self.is_proxy = false
    end

    # ALWAYS set network_range_id to the tracking /24.
    # This is what FetchIpapiDataJob uses to check ipapi_queried_at
    # and prevent duplicate API calls.
    self.network_range_id = tracking_network&.id
  rescue => e
    Rails.logger.error "Failed to populate network intelligence for event #{id}: #{e.message}"
    # Set defaults on error to prevent null values
    self.is_datacenter = false
    self.is_vpn = false
    self.is_proxy = false
  end
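  # Worked example of the tracking-network derivation below (a sketch using a
  # documentation IP; the helper masks the address down to its /24):
  #
  #   ip   = IPAddr.new("203.0.113.77")
  #   mask = (2**32 - 1) ^ ((2**(32 - 24)) - 1)   # 0xFFFFFF00
  #   IPAddr.new(ip.to_i & mask, Socket::AF_INET) # => 203.0.113.0
  #   # => tracking network "203.0.113.0/24"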
  # Find or create the /24 tracking network for this IP.
  # This is the fundamental unit for IPAPI deduplication.
  def find_or_create_tracking_network(ip_string)
    return nil if private_or_reserved_ip?(ip_string)

    ip_addr = IPAddr.new(ip_string)

    # Calculate the /24 for IPv4, the /64 for IPv6
    if ip_addr.ipv4?
      prefix_length = 24
      mask = (2**32 - 1) ^ ((2**(32 - prefix_length)) - 1)
      network_int = ip_addr.to_i & mask
      network_base = IPAddr.new(network_int, Socket::AF_INET)
      network_cidr = "#{network_base}/#{prefix_length}" # e.g., "1.2.3.0/24"
    else
      prefix_length = 64
      mask = (2**128 - 1) ^ ((2**(128 - prefix_length)) - 1)
      network_int = ip_addr.to_i & mask
      network_base = IPAddr.new(network_int, Socket::AF_INET6)
      network_cidr = "#{network_base}/#{prefix_length}" # e.g., "2001:db8::/64"
    end

    # Find or create the tracking network
    NetworkRange.find_or_create_by!(network: network_cidr) do |nr|
      nr.source = 'auto_generated'
      nr.creation_reason = 'tracking unit for IPAPI deduplication'
      nr.is_datacenter = begin
        NetworkRangeGenerator.datacenter_ip?(ip_addr)
      rescue
        false
      end
      nr.is_vpn = false
      nr.is_proxy = false
    end
  rescue => e
    Rails.logger.error "Failed to create tracking network for IP #{ip_string}: #{e.message}"
    nil
  end

  # Check if the IP is private or reserved (should not create network ranges)
  def private_or_reserved_ip?(ip_string = nil)
    ip_str = ip_string || ip_address.to_s
    ip = IPAddr.new(ip_str)

    # Private and reserved ranges
    [
      IPAddr.new('10.0.0.0/8'),
      IPAddr.new('172.16.0.0/12'),
      IPAddr.new('192.168.0.0/16'),
      IPAddr.new('127.0.0.0/8'),
      IPAddr.new('169.254.0.0/16'),
      IPAddr.new('224.0.0.0/4'),
      IPAddr.new('240.0.0.0/4'),
      IPAddr.new('::1/128'),
      IPAddr.new('fc00::/7'),
      IPAddr.new('fe80::/10'),
      IPAddr.new('ff00::/8')
    ].any? { |range| range.include?(ip) }
  rescue IPAddr::InvalidAddressError
    true # Treat invalid IPs as "reserved"
  end

  def extract_fields_from_payload
    return unless payload.present?

    # Extract WAF-specific fields for direct querying
    request_data = payload.dig("request") || {}
    response_data = payload.dig("response") || {}

    self.ip_address = request_data["ip"]

    # Extract the user agent with header name standardization;
    # normalize_headers downcases every key, so only the lowercase lookup is needed
    headers = request_data["headers"] || {}
    normalized_headers = normalize_headers(headers)
    self.user_agent = normalized_headers["user-agent"]

    self.request_path = request_data["path"]
    self.request_url = request_data["url"]
    self.response_status = response_data["status_code"]
    self.response_time_ms = response_data["duration_ms"]

    # Support both new (rule_id) and old (rule_matched) field names during cutover
    self.rule_id = payload["rule_id"] || payload["rule_matched"]
    self.blocked_reason = payload["blocked_reason"]

    # Store original values for normalization only if they don't exist yet.
    # This prevents overwriting during multiple callback runs.
    @raw_request_method ||= request_data["method"]
    @raw_request_protocol ||= request_data["protocol"]
    @raw_action ||= payload["waf_action"]

    # Extract server/environment info
    self.server_name = payload["server_name"]
    self.environment = payload["environment"]

    # Extract agent info
    agent_data = payload.dig("agent") || {}
    self.agent_version = agent_data["version"]
    self.agent_name = agent_data["name"]
  end

  def should_detect_bot?
    # Detect bots if a user agent is present or if we have network intelligence
    user_agent.present? || network_range_id.present?
  end

  def detect_bot_traffic
    self.is_bot = bot_detected?
  rescue => e
    Rails.logger.error "Failed to detect bot for event #{id}: #{e.message}"
    self.is_bot = false # Default to non-bot on error
  end
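  # Illustrative sketch of signal 1 below, using the device_detector gem
  # (the exact output depends on the gem's regex database):
  #
  #   detector = DeviceDetector.new("Mozilla/5.0 (compatible; Googlebot/2.1)")
  #   detector.bot?     # => true
  #   detector.bot_name # => "Googlebot" -> tagged as "bot:googlebot"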
  def bot_detected?
    # Multi-signal bot detection approach with tagging:
    # 1. User agent detection (DeviceDetector gem) - adds a bot:name tag
    # 2. Network range source matching (bot_import_* sources) - adds network tags
    # 3. Fallback to datacenter classification for infrastructure-based detection

    # Signal 1: User agent bot detection (uses DeviceDetector's built-in cache)
    if user_agent.present?
      begin
        detector = DeviceDetector.new(user_agent)
        if detector.bot?
          # Add a bot tag with the specific bot name
          bot_name = detector.bot_name&.downcase&.gsub(/\s+/, '_') || 'unknown'
          add_tag("bot:#{bot_name}")
          return true
        end
      rescue => e
        Rails.logger.debug "DeviceDetector failed for user agent: #{e.message}"
      end
    end

    # Signal 2: Network range from known bot sources
    if network_range_id.present?
      range = NetworkRange.find_by(id: network_range_id)
      if range
        # Check if the network range source indicates a bot import
        if range.source&.start_with?('bot_import_')
          # Extract the bot type from the source (e.g., 'bot_import_googlebot' -> 'googlebot')
          bot_type = range.source.sub('bot_import_', '')
          add_tag("bot:#{bot_type}")
          add_tag("network:#{range.company&.downcase&.gsub(/\s+/, '_')}") if range.company.present?
          return true
        end

        # Check if the company is a known bot provider (from bot imports).
        # Common bot companies: Google, Amazon, OpenAI, Cloudflare, Microsoft, etc.
        known_bot_companies = ['googlebot', 'google bot', 'amazon', 'aws', 'openai', 'anthropic',
                               'cloudflare', 'microsoft', 'facebook', 'meta', 'apple', 'duckduckgo']
        company_lower = company&.downcase
        if company_lower && known_bot_companies.any? { |bot| company_lower.include?(bot) }
          add_tag("bot:#{company_lower.gsub(/\s+/, '_')}")
          add_tag("network:#{company_lower.gsub(/\s+/, '_')}")
          return true
        end
      end
    end

    # Signal 3: Datacenter traffic is often bot traffic.
    # This is less precise, so we treat it as a weaker signal and only mark an
    # event as a bot if it is datacenter traffic AND has other suspicious characteristics.
    if is_datacenter && user_agent.present?
      # Generic/common bot user agents in datacenter networks
      ua_lower = user_agent.downcase
      bot_keywords = ['bot', 'crawler', 'spider', 'scraper', 'curl', 'wget', 'python', 'go-http-client']
      if bot_keywords.any? { |keyword| ua_lower.include?(keyword) }
        add_tag("bot:datacenter")
        add_tag("datacenter:true")
        return true
      end
    end

    # Default: not a bot
    false
  end
end
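
# Usage sketch for the bot-detection signals above (hypothetical data):
# detection runs automatically via the before_save callback, so tags and
# is_bot are populated on save:
#
#   event = Event.create_from_waf_payload!("req-1", payload)
#   event.is_bot    # => true for e.g. a curl request from a datacenter range
#   event.tag_list  # => "bot:datacenter, datacenter:true"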