Compare commits

...

3 Commits

23 changed files with 682 additions and 140 deletions

View File

@@ -27,7 +27,7 @@ RUN apt-get update -qq && \
*) \ *) \
echo "Unsupported platform: $TARGETPLATFORM" && exit 1 ;; \ echo "Unsupported platform: $TARGETPLATFORM" && exit 1 ;; \
esac && \ esac && \
wget "https://install.duckdb.org/v1.4.2/libduckdb-linux-${DUCKDB_ARCH}.zip" -O /tmp/libduckdb.zip && \ wget "https://github.com/duckdb/duckdb/releases/download/v1.4.2/libduckdb-linux-${DUCKDB_ARCH}.zip" -O /tmp/libduckdb.zip && \
unzip /tmp/libduckdb.zip -d /tmp/duckdb && \ unzip /tmp/libduckdb.zip -d /tmp/duckdb && \
cp /tmp/duckdb/duckdb.h /tmp/duckdb/duckdb.hpp /usr/local/include/ && \ cp /tmp/duckdb/duckdb.h /tmp/duckdb/duckdb.hpp /usr/local/include/ && \
cp /tmp/duckdb/libduckdb.so /usr/local/lib/ && \ cp /tmp/duckdb/libduckdb.so /usr/local/lib/ && \

View File

@@ -1 +1 @@
0.2.0 0.2.2

View File

@@ -7,6 +7,9 @@ class AnalyticsController < ApplicationController
def index def index
authorize :analytics, :index? authorize :analytics, :index?
# Track overall request time
request_start = Time.current
# Time period selector (default: last 24 hours) # Time period selector (default: last 24 hours)
@time_period = params[:period]&.to_sym || :day @time_period = params[:period]&.to_sym || :day
@start_time = calculate_start_time(@time_period) @start_time = calculate_start_time(@time_period)
@@ -24,10 +27,12 @@ class AnalyticsController < ApplicationController
cache_key_base = "analytics/#{@time_period}/#{@start_time.to_i}" cache_key_base = "analytics/#{@time_period}/#{@start_time.to_i}"
# Core statistics - cached (uses DuckDB if available) # Core statistics - cached (uses DuckDB if available)
stat_start = Time.current
@total_events = Rails.cache.fetch("#{cache_key_base}/total_events", expires_in: cache_ttl) do @total_events = Rails.cache.fetch("#{cache_key_base}/total_events", expires_in: cache_ttl) do
with_duckdb_fallback { EventDdb.count_since(@start_time) } || with_duckdb_fallback { EventDdb.count_since(@start_time) } ||
Event.where("timestamp >= ?", @start_time).count Event.where("timestamp >= ?", @start_time).count
end end
Rails.logger.info "[Analytics Perf] Total events: #{((Time.current - stat_start) * 1000).round(1)}ms"
@total_rules = Rails.cache.fetch("analytics/total_rules", expires_in: 5.minutes) do @total_rules = Rails.cache.fetch("analytics/total_rules", expires_in: 5.minutes) do
Rule.enabled.count Rule.enabled.count
@@ -42,14 +47,17 @@ class AnalyticsController < ApplicationController
end end
# Event breakdown by action - cached (uses DuckDB if available) # Event breakdown by action - cached (uses DuckDB if available)
stat_start = Time.current
@event_breakdown = Rails.cache.fetch("#{cache_key_base}/event_breakdown", expires_in: cache_ttl) do @event_breakdown = Rails.cache.fetch("#{cache_key_base}/event_breakdown", expires_in: cache_ttl) do
with_duckdb_fallback { EventDdb.breakdown_by_action(@start_time) } || with_duckdb_fallback { EventDdb.breakdown_by_action(@start_time) } ||
Event.where("timestamp >= ?", @start_time) Event.where("timestamp >= ?", @start_time)
.group(:waf_action) .group(:waf_action)
.count .count
end end
Rails.logger.info "[Analytics Perf] Event breakdown: #{((Time.current - stat_start) * 1000).round(1)}ms"
# Top countries by event count - cached (uses DuckDB if available) # Top countries by event count - cached (uses DuckDB if available)
stat_start = Time.current
@top_countries = Rails.cache.fetch("#{cache_key_base}/top_countries", expires_in: cache_ttl) do @top_countries = Rails.cache.fetch("#{cache_key_base}/top_countries", expires_in: cache_ttl) do
with_duckdb_fallback { EventDdb.top_countries(@start_time, 10) } || with_duckdb_fallback { EventDdb.top_countries(@start_time, 10) } ||
Event.where("timestamp >= ? AND country IS NOT NULL", @start_time) Event.where("timestamp >= ? AND country IS NOT NULL", @start_time)
@@ -58,8 +66,10 @@ class AnalyticsController < ApplicationController
.sort_by { |_, count| -count } .sort_by { |_, count| -count }
.first(10) .first(10)
end end
Rails.logger.info "[Analytics Perf] Top countries: #{((Time.current - stat_start) * 1000).round(1)}ms"
# Top blocked IPs - cached (uses DuckDB if available) # Top blocked IPs - cached (uses DuckDB if available)
stat_start = Time.current
@top_blocked_ips = Rails.cache.fetch("#{cache_key_base}/top_blocked_ips", expires_in: cache_ttl) do @top_blocked_ips = Rails.cache.fetch("#{cache_key_base}/top_blocked_ips", expires_in: cache_ttl) do
with_duckdb_fallback { EventDdb.top_blocked_ips(@start_time, 10) } || with_duckdb_fallback { EventDdb.top_blocked_ips(@start_time, 10) } ||
Event.where("timestamp >= ?", @start_time) Event.where("timestamp >= ?", @start_time)
@@ -69,6 +79,7 @@ class AnalyticsController < ApplicationController
.sort_by { |_, count| -count } .sort_by { |_, count| -count }
.first(10) .first(10)
end end
Rails.logger.info "[Analytics Perf] Top blocked IPs: #{((Time.current - stat_start) * 1000).round(1)}ms"
# Network range intelligence breakdown - cached # Network range intelligence breakdown - cached
@network_intelligence = Rails.cache.fetch("analytics/network_intelligence", expires_in: 10.minutes) do @network_intelligence = Rails.cache.fetch("analytics/network_intelligence", expires_in: 10.minutes) do
@@ -105,7 +116,11 @@ class AnalyticsController < ApplicationController
end end
# Prepare data for charts - split caching for current vs historical data # Prepare data for charts - split caching for current vs historical data
stat_start = Time.current
@chart_data = prepare_chart_data_with_split_cache(cache_key_base, cache_ttl) @chart_data = prepare_chart_data_with_split_cache(cache_key_base, cache_ttl)
Rails.logger.info "[Analytics Perf] Chart data: #{((Time.current - stat_start) * 1000).round(1)}ms"
Rails.logger.info "[Analytics Perf] TOTAL REQUEST: #{((Time.current - request_start) * 1000).round(1)}ms"
respond_to do |format| respond_to do |format|
format.html format.html

View File

@@ -36,6 +36,9 @@ class EventsController < ApplicationController
@events = @events.by_asn(params[:asn]) if params[:asn].present? @events = @events.by_asn(params[:asn]) if params[:asn].present?
@events = @events.by_network_cidr(params[:network_cidr]) if params[:network_cidr].present? @events = @events.by_network_cidr(params[:network_cidr]) if params[:network_cidr].present?
# Bot filtering
@events = @events.exclude_bots if params[:exclude_bots] == "true"
Rails.logger.debug "Events count after filtering: #{@events.count}" Rails.logger.debug "Events count after filtering: #{@events.count}"
# Debug info # Debug info

View File

@@ -261,12 +261,6 @@ def process_quick_create_parameters
# Ensure metadata is a hash # Ensure metadata is a hash
@rule.metadata = {} unless @rule.metadata.is_a?(Hash) @rule.metadata = {} unless @rule.metadata.is_a?(Hash)
# Handle add_header fields - use provided params or existing metadata values
if @rule.add_header_action? && (params[:header_name].present? || params[:header_value].present?)
@rule.metadata['header_name'] = params[:header_name].presence || @rule.metadata['header_name'] || 'X-Bot-Agent'
@rule.metadata['header_value'] = params[:header_value].presence || @rule.metadata['header_value'] || 'Unknown'
end
# Handle expires_at parsing for text input # Handle expires_at parsing for text input
if params.dig(:rule, :expires_at).present? if params.dig(:rule, :expires_at).present?
expires_at_str = params[:rule][:expires_at].strip expires_at_str = params[:rule][:expires_at].strip

View File

@@ -105,6 +105,11 @@ class Event < ApplicationRecord
joins(:network_range).where("network_ranges.network = ?", cidr) joins(:network_range).where("network_ranges.network = ?", cidr)
} }
# Bot filtering scopes
scope :bots, -> { where(is_bot: true) }
scope :humans, -> { where(is_bot: false) }
scope :exclude_bots, -> { where(is_bot: false) }
# Add association for the optional network_range_id # Add association for the optional network_range_id
belongs_to :network_range, optional: true belongs_to :network_range, optional: true
@@ -191,6 +196,9 @@ class Event < ApplicationRecord
# Populate network intelligence from IP address # Populate network intelligence from IP address
before_save :populate_network_intelligence, if: :should_populate_network_intelligence? before_save :populate_network_intelligence, if: :should_populate_network_intelligence?
# Detect bot traffic using user agent and network intelligence
before_save :detect_bot_traffic, if: :should_detect_bot?
# Backfill network intelligence for all events # Backfill network intelligence for all events
def self.backfill_network_intelligence!(batch_size: 10_000) def self.backfill_network_intelligence!(batch_size: 10_000)
total = where(country: nil).count total = where(country: nil).count
@@ -218,8 +226,8 @@ class Event < ApplicationRecord
# Normalize headers in payload during import phase # Normalize headers in payload during import phase
normalized_payload = normalize_payload_headers(payload) normalized_payload = normalize_payload_headers(payload)
# Create the WAF request event # Create the WAF request event with agent-provided tags
create!( event = create!(
request_id: request_id, request_id: request_id,
timestamp: parse_timestamp(normalized_payload["timestamp"]), timestamp: parse_timestamp(normalized_payload["timestamp"]),
payload: normalized_payload, payload: normalized_payload,
@@ -242,11 +250,18 @@ class Event < ApplicationRecord
server_name: normalized_payload["server_name"], server_name: normalized_payload["server_name"],
environment: normalized_payload["environment"], environment: normalized_payload["environment"],
# Tags: start with agent-provided tags only
tags: normalized_payload["tags"] || [],
# WAF agent info # WAF agent info
agent_version: normalized_payload.dig("agent", "version"), agent_version: normalized_payload.dig("agent", "version"),
agent_name: normalized_payload.dig("agent", "name") agent_name: normalized_payload.dig("agent", "name")
) )
# Apply rule tags using EventTagger service
EventTagger.tag_event(event)
event
end end
# Normalize headers in payload to lower case during import phase # Normalize headers in payload to lower case during import phase
@@ -339,7 +354,10 @@ class Event < ApplicationRecord
def tags def tags
# Use the dedicated tags column (array), fallback to payload during transition # Use the dedicated tags column (array), fallback to payload during transition
super.presence || (payload&.dig("tags") || []) # Ensure we always return an Array, even if payload has malformed data (e.g., {} instead of [])
result = super.presence || payload&.dig("tags")
return [] if result.nil?
result.is_a?(Array) ? result : []
end end
def headers def headers
@@ -699,4 +717,82 @@ class Event < ApplicationRecord
self.agent_version = agent_data["version"] self.agent_version = agent_data["version"]
self.agent_name = agent_data["name"] self.agent_name = agent_data["name"]
end end
# Guard for the detect_bot_traffic callback: bot detection only runs when
# at least one input signal exists, i.e. a user agent string or a linked
# network range id.
def should_detect_bot?
  return true if user_agent.present?

  network_range_id.present?
end
# Stores the bot verdict on the record. Any error raised while computing
# the verdict is logged and the event is treated as non-bot, so the caller
# never sees the exception.
def detect_bot_traffic
  begin
    verdict = bot_detected?
    self.is_bot = verdict
  rescue StandardError => error
    Rails.logger.error "Failed to detect bot for event #{id}: #{error.message}"
    # Fail open: an undetectable event is recorded as human traffic.
    self.is_bot = false
  end
end
# Company-name fragments treated as bot providers (substring match on the
# lower-cased company name).
BOT_PROVIDER_COMPANY_KEYWORDS = [
  'googlebot', 'google bot', 'amazon', 'aws', 'openai',
  'anthropic', 'cloudflare', 'microsoft', 'facebook',
  'meta', 'apple', 'duckduckgo'
].freeze

# User-agent fragments that mark datacenter traffic as automated.
DATACENTER_BOT_UA_KEYWORDS = %w[bot crawler spider scraper curl wget python go-http-client].freeze

# Multi-signal bot detection with tagging.
#
# Signals are evaluated in order and the first one that matches wins:
#   1. DeviceDetector classifies the user agent as a bot.
#   2. The linked network range comes from a bot import, or the company
#      name contains a known bot-provider keyword.
#   3. Datacenter traffic whose user agent contains a generic automation
#      keyword.
#
# Side effect: the matching signal adds its tags through add_tag.
# Returns true when any signal matched, false otherwise.
def bot_detected?
  return true if bot_user_agent_signal?
  return true if bot_network_range_signal?
  return true if bot_datacenter_signal?

  false
end

# Signal 1: user agent classified as a bot by DeviceDetector; adds a bot:<name> tag.
def bot_user_agent_signal?
  return false unless user_agent.present?

  begin
    detector = DeviceDetector.new(user_agent)
    return false unless detector.bot?

    bot_name = detector.bot_name&.downcase&.gsub(/\s+/, '_') || 'unknown'
    add_tag("bot:#{bot_name}")
    true
  rescue => e
    # Best effort: a detector failure must not block the remaining signals.
    Rails.logger.debug "DeviceDetector failed for user agent: #{e.message}"
    false
  end
end

# Signal 2: network range imported from a bot source, or company matching a known bot provider.
def bot_network_range_signal?
  return false unless network_range_id.present?

  range = NetworkRange.find_by(id: network_range_id)
  return false unless range

  if range.source&.start_with?('bot_import_')
    # 'bot_import_googlebot' -> 'googlebot'
    add_tag("bot:#{range.source.delete_prefix('bot_import_')}")
    add_tag("network:#{range.company&.downcase&.gsub(/\s+/, '_')}") if range.company.present?
    return true
  end

  # NOTE(review): this reads the event's own `company`, not `range.company`;
  # kept as in the original - confirm which one is intended.
  company_lower = company&.downcase
  return false unless company_lower
  return false unless BOT_PROVIDER_COMPANY_KEYWORDS.any? { |bot| company_lower.include?(bot) }

  company_slug = company_lower.gsub(/\s+/, '_')
  add_tag("bot:#{company_slug}")
  add_tag("network:#{company_slug}")
  true
end

# Signal 3 (weakest): datacenter traffic combined with an automation-style user agent.
def bot_datacenter_signal?
  return false unless is_datacenter && user_agent.present?

  ua_lower = user_agent.downcase
  return false unless DATACENTER_BOT_UA_KEYWORDS.any? { |keyword| ua_lower.include?(keyword) }

  add_tag("bot:datacenter")
  add_tag("datacenter:true")
  true
end
end end

View File

@@ -34,7 +34,10 @@ class EventDdb
SQL SQL
# Convert to hash like ActiveRecord .group.count returns # Convert to hash like ActiveRecord .group.count returns
result.to_a.to_h { |row| [row["waf_action"], row["count"]] } # DuckDB returns integer enum values, map to string names
# 0=deny, 1=allow, 2=redirect, 3=challenge, 4=log
action_map = { 0 => "deny", 1 => "allow", 2 => "redirect", 3 => "challenge", 4 => "log" }
result.to_a.to_h { |row| [action_map[row[0]] || "unknown", row[1]] }
end end
rescue StandardError => e rescue StandardError => e
Rails.logger.error "[EventDdb] Error in breakdown_by_action: #{e.message}" Rails.logger.error "[EventDdb] Error in breakdown_by_action: #{e.message}"
@@ -54,7 +57,8 @@ class EventDdb
SQL SQL
# Return array of [country, count] tuples like ActiveRecord # Return array of [country, count] tuples like ActiveRecord
result.to_a.map { |row| [row["country"], row["count"]] } # DuckDB returns arrays: [country, count]
result.to_a.map { |row| [row[0], row[1]] }
end end
rescue StandardError => e rescue StandardError => e
Rails.logger.error "[EventDdb] Error in top_countries: #{e.message}" Rails.logger.error "[EventDdb] Error in top_countries: #{e.message}"
@@ -73,7 +77,8 @@ class EventDdb
LIMIT ? LIMIT ?
SQL SQL
result.to_a.map { |row| [row["ip_address"], row["count"]] } # DuckDB returns arrays: [ip_address, count]
result.to_a.map { |row| [row[0], row[1]] }
end end
rescue StandardError => e rescue StandardError => e
Rails.logger.error "[EventDdb] Error in top_blocked_ips: #{e.message}" Rails.logger.error "[EventDdb] Error in top_blocked_ips: #{e.message}"
@@ -94,7 +99,8 @@ class EventDdb
SQL SQL
# Convert to hash with Time keys like ActiveRecord # Convert to hash with Time keys like ActiveRecord
result.to_a.to_h { |row| [row["hour"], row["count"]] } # DuckDB returns arrays: [hour, count]
result.to_a.to_h { |row| [row[0], row[1]] }
end end
rescue StandardError => e rescue StandardError => e
Rails.logger.error "[EventDdb] Error in hourly_timeline: #{e.message}" Rails.logger.error "[EventDdb] Error in hourly_timeline: #{e.message}"
@@ -495,5 +501,128 @@ class EventDdb
Rails.logger.error "[EventDdb] Error in suspicious_patterns: #{e.message}" Rails.logger.error "[EventDdb] Error in suspicious_patterns: #{e.message}"
nil nil
end end
# Bot traffic analysis - breakdown of bot vs human traffic
#
# start_time - lower bound (inclusive) applied to events.timestamp.
#
# Returns a Hash keyed by "bot" / "human", each value being
# { "event_count" => Integer, "unique_ips" => Integer }, or nil when the
# query fails (the error is logged).
def bot_traffic_breakdown(start_time)
  service.with_connection do |conn|
    # is_bot is nullable in the DuckDB table. Grouping on the raw column can
    # yield both a NULL and a false group, which would both map to "human"
    # and overwrite each other below. COALESCE folds NULL into false so each
    # label is produced by exactly one row.
    result = conn.query(<<~SQL, start_time)
      SELECT
        COALESCE(is_bot, false) AS bot_flag,
        COUNT(*) as event_count,
        COUNT(DISTINCT ip_address) as unique_ips
      FROM events
      WHERE timestamp >= ?
      GROUP BY COALESCE(is_bot, false)
    SQL

    # Rows arrive as positional arrays: [bot_flag, event_count, unique_ips]
    result.to_a.to_h do |bot_flag, event_count, unique_ips|
      [
        bot_flag ? "bot" : "human",
        {
          "event_count" => event_count,
          "unique_ips" => unique_ips
        }
      ]
    end
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error in bot_traffic_breakdown: #{e.message}"
  nil
end
# Number of events since start_time whose is_bot flag is false.
# Yields 0 when the query produces no row or a nil count, and nil when the
# query raises (the error is logged).
def human_traffic_count(start_time)
  service.with_connection do |conn|
    rows = conn.query(<<~SQL, start_time)
      SELECT COUNT(*) as count
      FROM events
      WHERE timestamp >= ? AND is_bot = false
    SQL

    first_row = rows.first
    total = first_row&.first
    total || 0
  end
rescue StandardError => error
  Rails.logger.error "[EventDdb] Error in human_traffic_count: #{error.message}"
  nil
end
# Number of events since start_time whose is_bot flag is true.
# Yields 0 when the query produces no row or a nil count, and nil when the
# query raises (the error is logged).
def bot_traffic_count(start_time)
  service.with_connection do |conn|
    rows = conn.query(<<~SQL, start_time)
      SELECT COUNT(*) as count
      FROM events
      WHERE timestamp >= ? AND is_bot = true
    SQL

    first_row = rows.first
    total = first_row&.first
    total || 0
  end
rescue StandardError => error
  Rails.logger.error "[EventDdb] Error in bot_traffic_count: #{error.message}"
  nil
end
# Most active bot user agents since start_time, busiest first.
#
# start_time - lower bound (inclusive) applied to events.timestamp.
# limit      - maximum number of user agents to return (default 20).
#
# Returns an Array of { user_agent:, event_count:, unique_ips: } hashes,
# or nil when the query raises (the error is logged).
def top_bot_user_agents(start_time, limit = 20)
  service.with_connection do |conn|
    rows = conn.query(<<~SQL, start_time, limit)
      SELECT
      user_agent,
      COUNT(*) as event_count,
      COUNT(DISTINCT ip_address) as unique_ips
      FROM events
      WHERE timestamp >= ? AND is_bot = true AND user_agent IS NOT NULL
      GROUP BY user_agent
      ORDER BY event_count DESC
      LIMIT ?
    SQL

    # Each row is a positional array: [user_agent, event_count, unique_ips]
    rows.to_a.map do |agent, events, ips|
      { user_agent: agent, event_count: events, unique_ips: ips }
    end
  end
rescue StandardError => error
  Rails.logger.error "[EventDdb] Error in top_bot_user_agents: #{error.message}"
  nil
end
# Hourly bot vs human event counts for start_time <= timestamp < end_time.
#
# Returns a Hash keyed by the truncated hour, each value being
# { "bot_count" =>, "human_count" =>, "total" => }, or nil when the query
# raises (the error is logged).
def bot_traffic_timeline(start_time, end_time)
  service.with_connection do |conn|
    rows = conn.query(<<~SQL, start_time, end_time)
      SELECT
      DATE_TRUNC('hour', timestamp) as hour,
      SUM(CASE WHEN is_bot = true THEN 1 ELSE 0 END) as bot_count,
      SUM(CASE WHEN is_bot = false THEN 1 ELSE 0 END) as human_count
      FROM events
      WHERE timestamp >= ? AND timestamp < ?
      GROUP BY hour
      ORDER BY hour
    SQL

    # Each row is a positional array: [hour, bot_count, human_count]
    rows.to_a.each_with_object({}) do |(hour, bots, humans), timeline|
      timeline[hour] = {
        "bot_count" => bots,
        "human_count" => humans,
        "total" => bots + humans
      }
    end
  end
rescue StandardError => error
  Rails.logger.error "[EventDdb] Error in bot_traffic_timeline: #{error.message}"
  nil
end
end end
end end

View File

@@ -7,7 +7,11 @@
# and classification flags (datacenter, proxy, VPN). # and classification flags (datacenter, proxy, VPN).
class NetworkRange < ApplicationRecord class NetworkRange < ApplicationRecord
# Sources for network range creation # Sources for network range creation
SOURCES = %w[api_imported user_created manual auto_generated inherited geolite_asn geolite_country].freeze SOURCES = %w[api_imported user_created manual auto_generated inherited geolite_asn geolite_country
bot_import_amazon_aws bot_import_google bot_import_microsoft_bing bot_import_anthropic
bot_import_openai_searchbot bot_import_openai_chatgpt_user bot_import_openai_gptbot
bot_import_cloudflare bot_import_facebook bot_import_applebot bot_import_duckduckgo
production_import].freeze
# Associations # Associations
has_many :rules, dependent: :destroy has_many :rules, dependent: :destroy
@@ -116,19 +120,19 @@ class NetworkRange < ApplicationRecord
# Parent/child relationships # Parent/child relationships
def parent_ranges def parent_ranges
NetworkRange.where("?::inet << network AND masklen(network) < ?", network.to_s, prefix_length) # Find networks that contain this network (less specific / shorter prefix)
.order("masklen(network) DESC") # The << operator implicitly means the containing network has a shorter prefix
# IMPORTANT: Use cidr (not network.to_s) to preserve the network mask
NetworkRange.where("?::inet << network", cidr)
.order("masklen(network) DESC") # Most specific parent first
end end
def child_ranges def child_ranges
NetworkRange.where("network >> ?::inet AND masklen(network) > ?", network.to_s, prefix_length) # Find networks that are contained by this network (more specific / longer prefix)
.order("masklen(network) ASC") # The >> operator implicitly means the contained network has a longer prefix
end # IMPORTANT: Use cidr (not network.to_s) to preserve the network mask
NetworkRange.where("?::inet >> network", cidr)
def sibling_ranges .order("masklen(network) ASC") # Least specific child first
NetworkRange.where("masklen(network) = ?", prefix_length)
.where("network && ?::inet", network.to_s)
.where.not(id: id)
end end
# Find nearest parent with intelligence data # Find nearest parent with intelligence data

View File

@@ -7,7 +7,8 @@
class Rule < ApplicationRecord class Rule < ApplicationRecord
# Rule enums (prefix needed to avoid rate_limit collision) # Rule enums (prefix needed to avoid rate_limit collision)
# Canonical WAF action order - aligned with Agent and Event models # Canonical WAF action order - aligned with Agent and Event models
enum :waf_action, { deny: 0, allow: 1, redirect: 2, challenge: 3, log: 4, add_header: 5 }, prefix: :action # Note: allow and log actions can include headers/tags in metadata for automatic injection
enum :waf_action, { deny: 0, allow: 1, redirect: 2, challenge: 3, log: 4 }, prefix: :action
enum :waf_rule_type, { network: 0, rate_limit: 1, path_pattern: 2 }, prefix: :type enum :waf_rule_type, { network: 0, rate_limit: 1, path_pattern: 2 }, prefix: :type
SOURCES = %w[manual auto:scanner_detected auto:rate_limit_exceeded auto:bot_detected imported default manual:surgical_block manual:surgical_exception policy].freeze SOURCES = %w[manual auto:scanner_detected auto:rate_limit_exceeded auto:bot_detected imported default manual:surgical_block manual:surgical_exception policy].freeze
@@ -120,10 +121,6 @@ class Rule < ApplicationRecord
action_challenge? action_challenge?
end end
def add_header_action?
action_add_header?
end
# Redirect/challenge convenience methods # Redirect/challenge convenience methods
def redirect_url def redirect_url
metadata_hash['redirect_url'] metadata_hash['redirect_url']
@@ -141,12 +138,40 @@ class Rule < ApplicationRecord
metadata&.dig('challenge_message') metadata&.dig('challenge_message')
end end
def header_name # Tag-related methods
metadata&.dig('header_name') def tags
metadata_hash['tags'] || []
end end
def header_value def tags=(new_tags)
metadata&.dig('header_value') self.metadata = metadata_hash.merge('tags' => Array(new_tags))
end
def add_tag(tag)
current_tags = tags
return if current_tags.include?(tag.to_s)
self.metadata = metadata_hash.merge('tags' => (current_tags + [tag.to_s]))
end
def remove_tag(tag)
current_tags = tags
return unless current_tags.include?(tag.to_s)
self.metadata = metadata_hash.merge('tags' => (current_tags - [tag.to_s]))
end
def has_tag?(tag)
tags.include?(tag.to_s)
end
# Headers for add_header action or metadata-based header injection
def headers
metadata_hash['headers'] || {}
end
def headers=(new_headers)
self.metadata = metadata_hash.merge('headers' => new_headers.to_h)
end end
def related_surgical_rules def related_surgical_rules
@@ -433,12 +458,6 @@ class Rule < ApplicationRecord
if source&.start_with?('auto:') || source == 'default' if source&.start_with?('auto:') || source == 'default'
self.user ||= User.find_by(role: 1) # admin role self.user ||= User.find_by(role: 1) # admin role
end end
# Set default header values for add_header action
if add_header_action?
self.metadata['header_name'] ||= 'X-Bot-Agent'
self.metadata['header_value'] ||= 'Unknown'
end
end end
def calculate_priority_for_network_rules def calculate_priority_for_network_rules
@@ -522,13 +541,6 @@ class Rule < ApplicationRecord
if challenge_type_value && !%w[captcha javascript proof_of_work].include?(challenge_type_value) if challenge_type_value && !%w[captcha javascript proof_of_work].include?(challenge_type_value)
errors.add(:metadata, "challenge_type must be one of: captcha, javascript, proof_of_work") errors.add(:metadata, "challenge_type must be one of: captcha, javascript, proof_of_work")
end end
when "add_header"
unless metadata&.dig("header_name").present?
errors.add(:metadata, "must include 'header_name' for add_header action")
end
unless metadata&.dig("header_value").present?
errors.add(:metadata, "must include 'header_value' for add_header action")
end
end end
end end

View File

@@ -9,7 +9,7 @@ class WafPolicy < ApplicationRecord
POLICY_TYPES = %w[country asn company network_type path_pattern].freeze POLICY_TYPES = %w[country asn company network_type path_pattern].freeze
# Actions - what to do when traffic matches this policy # Actions - what to do when traffic matches this policy
ACTIONS = %w[allow deny redirect challenge add_header].freeze ACTIONS = %w[allow deny redirect challenge log].freeze
# Associations # Associations
belongs_to :user belongs_to :user
@@ -25,7 +25,6 @@ validate :targets_must_be_array
validate :validate_targets_by_type validate :validate_targets_by_type
validate :validate_redirect_configuration, if: :redirect_policy_action? validate :validate_redirect_configuration, if: :redirect_policy_action?
validate :validate_challenge_configuration, if: :challenge_policy_action? validate :validate_challenge_configuration, if: :challenge_policy_action?
validate :validate_add_header_configuration, if: :add_header_policy_action?
# Scopes # Scopes
scope :enabled, -> { where(enabled: true) } scope :enabled, -> { where(enabled: true) }
@@ -96,10 +95,6 @@ validate :targets_must_be_array
policy_action == 'challenge' policy_action == 'challenge'
end end
def add_header_policy_action?
policy_action == 'add_header'
end
# Lifecycle methods # Lifecycle methods
def active? def active?
enabled? && !expired? enabled? && !expired?
@@ -168,7 +163,7 @@ validate :targets_must_be_array
priority: network_range.prefix_length priority: network_range.prefix_length
) )
# Handle redirect/challenge/add_header specific data # Handle redirect/challenge specific data
if redirect_action? && additional_data['redirect_url'] if redirect_action? && additional_data['redirect_url']
rule.update!( rule.update!(
metadata: rule.metadata.merge( metadata: rule.metadata.merge(
@@ -183,13 +178,6 @@ validate :targets_must_be_array
challenge_message: additional_data['challenge_message'] challenge_message: additional_data['challenge_message']
) )
) )
elsif add_header_action?
rule.update!(
metadata: rule.metadata.merge(
header_name: additional_data['header_name'],
header_value: additional_data['header_value']
)
)
end end
rule rule
@@ -224,7 +212,7 @@ validate :targets_must_be_array
priority: 50 # Default priority for path rules priority: 50 # Default priority for path rules
) )
# Handle redirect/challenge/add_header specific data # Handle redirect/challenge specific data
if redirect_action? && additional_data['redirect_url'] if redirect_action? && additional_data['redirect_url']
rule.update!( rule.update!(
metadata: rule.metadata.merge( metadata: rule.metadata.merge(
@@ -239,13 +227,6 @@ validate :targets_must_be_array
challenge_message: additional_data['challenge_message'] challenge_message: additional_data['challenge_message']
) )
) )
elsif add_header_action?
rule.update!(
metadata: rule.metadata.merge(
header_name: additional_data['header_name'],
header_value: additional_data['header_value']
)
)
end end
rule rule
@@ -365,12 +346,6 @@ validate :targets_must_be_array
self.targets ||= [] self.targets ||= []
self.additional_data ||= {} self.additional_data ||= {}
self.enabled = true if enabled.nil? self.enabled = true if enabled.nil?
# Set default header values for add_header action
if add_header_policy_action?
self.additional_data['header_name'] ||= 'X-Bot-Agent'
self.additional_data['header_value'] ||= 'Unknown'
end
end end
def targets_must_be_array def targets_must_be_array
@@ -455,15 +430,6 @@ validate :targets_must_be_array
end end
end end
def validate_add_header_configuration
if additional_data['header_name'].blank?
errors.add(:additional_data, "must include 'header_name' for add_header action")
end
if additional_data['header_value'].blank?
errors.add(:additional_data, "must include 'header_value' for add_header action")
end
end
# Matching logic for different policy types # Matching logic for different policy types
def matches_country?(network_range) def matches_country?(network_range)
country = network_range.country || network_range.inherited_intelligence[:country] country = network_range.country || network_range.inherited_intelligence[:country]

View File

@@ -33,9 +33,11 @@ class AnalyticsDuckdbService
is_datacenter BOOLEAN, is_datacenter BOOLEAN,
is_vpn BOOLEAN, is_vpn BOOLEAN,
is_proxy BOOLEAN, is_proxy BOOLEAN,
is_bot BOOLEAN,
waf_action INTEGER, waf_action INTEGER,
request_path VARCHAR, request_path VARCHAR,
user_agent VARCHAR user_agent VARCHAR,
tags VARCHAR[]
) )
SQL SQL
@@ -101,6 +103,9 @@ class AnalyticsDuckdbService
batch_count = 0 batch_count = 0
begin begin
# Create initial appender
appender = conn.appender("events")
# Use PostgreSQL cursor for memory-efficient streaming # Use PostgreSQL cursor for memory-efficient streaming
Event.where("timestamp >= ? AND id > ?", from_timestamp, max_id) Event.where("timestamp >= ? AND id > ?", from_timestamp, max_id)
.select( .select(
@@ -115,18 +120,14 @@ class AnalyticsDuckdbService
:is_datacenter, :is_datacenter,
:is_vpn, :is_vpn,
:is_proxy, :is_proxy,
:is_bot,
:waf_action, :waf_action,
:request_path, :request_path,
:user_agent :user_agent,
:tags
) )
.order(:id) .order(:id)
.each_row(block_size: BATCH_SIZE) do |event_data| .each_row(block_size: BATCH_SIZE) do |event_data|
# Create new appender for each batch
if batch_count % BATCH_SIZE == 0
appender&.close # Close previous appender
appender = conn.appender("events")
end
# Unpack event data from cursor row (Hash from each_row) # Unpack event data from cursor row (Hash from each_row)
begin begin
appender.append_row( appender.append_row(
@@ -141,9 +142,11 @@ class AnalyticsDuckdbService
event_data["is_datacenter"], event_data["is_datacenter"],
event_data["is_vpn"], event_data["is_vpn"],
event_data["is_proxy"], event_data["is_proxy"],
event_data["is_bot"],
event_data["waf_action"], event_data["waf_action"],
event_data["request_path"], event_data["request_path"],
event_data["user_agent"] event_data["user_agent"],
event_data["tags"] || []
) )
rescue StandardError => e rescue StandardError => e
Rails.logger.error "[DuckDB] Error appending event #{event_data['id']}: #{e.message}" Rails.logger.error "[DuckDB] Error appending event #{event_data['id']}: #{e.message}"
@@ -154,8 +157,10 @@ class AnalyticsDuckdbService
batch_count += 1 batch_count += 1
total_synced += 1 total_synced += 1
# Log progress every BATCH_SIZE events # Flush and recreate appender every BATCH_SIZE events to avoid chunk overflow
if batch_count % BATCH_SIZE == 0 if batch_count % BATCH_SIZE == 0
appender.close
appender = conn.appender("events")
Rails.logger.info "[DuckDB] Synced batch (total: #{total_synced} events)" Rails.logger.info "[DuckDB] Synced batch (total: #{total_synced} events)"
end end
end end
@@ -222,7 +227,8 @@ class AnalyticsDuckdbService
SQL SQL
# Convert to hash like PostgreSQL returns # Convert to hash like PostgreSQL returns
result.to_a.to_h { |row| [row["waf_action"], row["count"]] } # DuckDB returns arrays: [waf_action, count]
result.to_a.to_h { |row| [row[0], row[1]] }
end end
end end
@@ -238,7 +244,8 @@ class AnalyticsDuckdbService
LIMIT ? LIMIT ?
SQL SQL
result.to_a.map { |row| [row["country"], row["count"]] } # DuckDB returns arrays: [country, count]
result.to_a.map { |row| [row[0], row[1]] }
end end
end end
@@ -254,7 +261,8 @@ class AnalyticsDuckdbService
LIMIT ? LIMIT ?
SQL SQL
result.to_a.map { |row| [row["ip_address"], row["count"]] } # DuckDB returns arrays: [ip_address, count]
result.to_a.map { |row| [row[0], row[1]] }
end end
end end
@@ -272,7 +280,8 @@ class AnalyticsDuckdbService
SQL SQL
# Convert to hash with Time keys like PostgreSQL # Convert to hash with Time keys like PostgreSQL
result.to_a.to_h { |row| [row["hour"], row["count"]] } # DuckDB returns arrays: [hour, count]
result.to_a.to_h { |row| [row[0], row[1]] }
end end
end end

View File

@@ -173,6 +173,7 @@ class BotNetworkRangeImporter
http = Net::HTTP.new(uri.host, uri.port) http = Net::HTTP.new(uri.host, uri.port)
http.use_ssl = true http.use_ssl = true
http.read_timeout = 30 http.read_timeout = 30
http.verify_mode = OpenSSL::SSL::VERIFY_NONE if uri.scheme == 'https'
response = http.get(uri.request_uri) response = http.get(uri.request_uri)
raise ImportError, "Failed to fetch AWS IP ranges: #{response.code}" unless response.code == '200' raise ImportError, "Failed to fetch AWS IP ranges: #{response.code}" unless response.code == '200'
@@ -223,7 +224,7 @@ class BotNetworkRangeImporter
puts "Amazon AWS import completed: #{imported_count} ranges imported" puts "Amazon AWS import completed: #{imported_count} ranges imported"
{ imported: imported_count, source: 'Amazon AWS' } { imported: imported_count, source: 'Amazon AWS' }
rescue Net::TimeoutError, Net::OpenTimeout => e rescue Timeout::Error, Net::OpenTimeout => e
raise ImportError, "Network timeout while fetching AWS ranges: #{e.message}" raise ImportError, "Network timeout while fetching AWS ranges: #{e.message}"
rescue JSON::ParserError => e rescue JSON::ParserError => e
raise ImportError, "Failed to parse AWS JSON response: #{e.message}" raise ImportError, "Failed to parse AWS JSON response: #{e.message}"
@@ -341,6 +342,7 @@ class BotNetworkRangeImporter
http = Net::HTTP.new(uri.host, uri.port) http = Net::HTTP.new(uri.host, uri.port)
http.use_ssl = true http.use_ssl = true
http.read_timeout = 30 http.read_timeout = 30
http.verify_mode = OpenSSL::SSL::VERIFY_NONE if uri.scheme == 'https'
response = http.get(uri.request_uri) response = http.get(uri.request_uri)
raise ImportError, "Failed to fetch OpenAI IP ranges: #{response.code}" unless response.code == '200' raise ImportError, "Failed to fetch OpenAI IP ranges: #{response.code}" unless response.code == '200'
@@ -353,12 +355,15 @@ class BotNetworkRangeImporter
# Determine crawler type from source name # Determine crawler type from source name
crawler_type = source[:name].gsub('OpenAI ', '').downcase crawler_type = source[:name].gsub('OpenAI ', '').downcase
data.each do |entry| # Handle different OpenAI JSON formats
# OpenAI provides IP ranges as either CIDR notation or single IPs prefixes = data['prefixes'] || data
ip_range = entry['cidr'] || entry['ip_prefix'] || entry['ip']
prefixes.each do |entry|
# OpenAI provides IP ranges as ipv4Prefix/ipv6Prefix or cidr/ip_prefix
ip_range = entry['ipv4Prefix'] || entry['ipv6Prefix'] || entry['cidr'] || entry['ip_prefix'] || entry['ip']
next unless ip_range next unless ip_range
# Convert single IPs to /32 # Convert single IPs to /32 or /128
network = ip_range.include?('/') ? ip_range : "#{ip_range}/32" network = ip_range.include?('/') ? ip_range : "#{ip_range}/32"
network_range = { network_range = {
@@ -396,7 +401,7 @@ class BotNetworkRangeImporter
puts "OpenAI #{crawler_type} import completed: #{imported_count} ranges imported" puts "OpenAI #{crawler_type} import completed: #{imported_count} ranges imported"
{ imported: imported_count, source: "OpenAI #{crawler_type}" } { imported: imported_count, source: "OpenAI #{crawler_type}" }
rescue Net::TimeoutError, Net::OpenTimeout => e rescue Timeout::Error, Net::OpenTimeout => e
raise ImportError, "Network timeout while fetching OpenAI #{crawler_type} ranges: #{e.message}" raise ImportError, "Network timeout while fetching OpenAI #{crawler_type} ranges: #{e.message}"
rescue JSON::ParserError => e rescue JSON::ParserError => e
raise ImportError, "Failed to parse OpenAI #{crawler_type} JSON response: #{e.message}" raise ImportError, "Failed to parse OpenAI #{crawler_type} JSON response: #{e.message}"
@@ -483,7 +488,8 @@ class BotNetworkRangeImporter
raise ImportError, "Failed to fetch Cloudflare ranges: #{response.code}" unless response.code == '200' raise ImportError, "Failed to fetch Cloudflare ranges: #{response.code}" unless response.code == '200'
# Cloudflare provides plain text CIDR lists # Cloudflare provides plain text CIDR lists
lines = response.body.split("\n") # Handle both newline-separated and single-line formats
lines = response.body.include?("\n") ? response.body.split("\n") : response.body.split
ip_version = url.include?('ips-v4') ? 4 : 6 ip_version = url.include?('ips-v4') ? 4 : 6
lines.each do |line| lines.each do |line|

View File

@@ -25,7 +25,7 @@
<div> <div>
<%= form.label :waf_action, "Action", class: "block text-sm font-medium text-gray-700" %> <%= form.label :waf_action, "Action", class: "block text-sm font-medium text-gray-700" %>
<%= form.select :waf_action, <%= form.select :waf_action,
options_for_select([['All', ''], ['Allow', 'allow'], ['Deny', 'deny'], ['Redirect', 'redirect'], ['Challenge', 'challenge'], ['Add Header', 'add_header']], params[:waf_action]), options_for_select([['All', ''], ['Allow', 'allow'], ['Deny', 'deny'], ['Redirect', 'redirect'], ['Challenge', 'challenge'], ['Log', 'log']], params[:waf_action]),
{ }, { class: "mt-1 block w-full rounded-md border-gray-300 shadow-sm focus:border-blue-500 focus:ring-blue-500 sm:text-sm" } %> { }, { class: "mt-1 block w-full rounded-md border-gray-300 shadow-sm focus:border-blue-500 focus:ring-blue-500 sm:text-sm" } %>
</div> </div>
<div> <div>
@@ -77,6 +77,20 @@
placeholder: "e.g., 192.168.1.0/24" %> placeholder: "e.g., 192.168.1.0/24" %>
</div> </div>
</div> </div>
<!-- Bot Filtering -->
<div class="mt-4 flex items-center">
<div class="flex items-center h-5">
<%= form.check_box :exclude_bots,
{ checked: params[:exclude_bots] == "true", class: "h-4 w-4 text-blue-600 focus:ring-blue-500 border-gray-300 rounded" },
"true", "false" %>
</div>
<div class="ml-3 text-sm">
<%= form.label :exclude_bots, class: "font-medium text-gray-700" do %>
Human Traffic Only
<span class="font-normal text-gray-500">(Exclude known bots and crawlers)</span>
<% end %>
</div>
</div>
</div> </div>
<% end %> <% end %>
</div> </div>

View File

@@ -159,27 +159,6 @@
</div> </div>
</div> </div>
<!-- Add Header Fields (shown for add_header action) -->
<div id="add_header_section" class="hidden space-y-4" data-rule-form-target="addHeaderSection">
<div>
<%= label_tag :header_name, "Header Name", class: "block text-sm font-medium text-gray-700" %>
<%= text_field_tag :header_name, "",
class: "mt-1 block w-full rounded-md border-gray-300 shadow-sm focus:border-blue-500 focus:ring-blue-500 sm:text-sm",
placeholder: "X-Bot-Agent",
id: "header_name_input" %>
<p class="mt-2 text-sm text-gray-500">The HTTP header name to add (e.g., X-Bot-Agent, X-Network-Type)</p>
</div>
<div>
<%= label_tag :header_value, "Header Value", class: "block text-sm font-medium text-gray-700" %>
<%= text_field_tag :header_value, "",
class: "mt-1 block w-full rounded-md border-gray-300 shadow-sm focus:border-blue-500 focus:ring-blue-500 sm:text-sm",
placeholder: "BingBot",
id: "header_value_input" %>
<p class="mt-2 text-sm text-gray-500">The value for the header (e.g., BingBot, GoogleBot, Unknown)</p>
</div>
</div>
<!-- Metadata --> <!-- Metadata -->
<div data-controller="json-validator" data-json-validator-valid-class="json-valid" data-json-validator-invalid-class="json-invalid" data-json-validator-valid-status-class="json-valid-status" data-json-validator-invalid-status-class="json-invalid-status"> <div data-controller="json-validator" data-json-validator-valid-class="json-valid" data-json-validator-invalid-class="json-invalid" data-json-validator-valid-status-class="json-valid-status" data-json-validator-invalid-status-class="json-invalid-status">
<%= form.label :metadata, "Metadata", class: "block text-sm font-medium text-gray-700" %> <%= form.label :metadata, "Metadata", class: "block text-sm font-medium text-gray-700" %>

View File

@@ -0,0 +1,6 @@
# frozen_string_literal: true

# Configure the DeviceDetector cache size.
# The default is 5,000 entries; we raise the cap to 10,000 for a better hit rate.
# Memory usage: roughly 1-2MB for 10k cached user agents (estimate).
DeviceDetector.config.max_cache_keys = 10_000

View File

@@ -0,0 +1,5 @@
# frozen_string_literal: true

# Namespace module that exposes the application's version string.
module BaffleHub
# Version string for this release.
VERSION = "0.3.0"
end

View File

@@ -0,0 +1,6 @@
# Adds a non-null boolean `is_bot` column (default false) to the events table
# and indexes it.
class AddIsBotToEvents < ActiveRecord::Migration[8.1]
  def change
    change_table :events do |t|
      t.boolean :is_bot, default: false, null: false
      t.index :is_bot
    end
  end
end

View File

@@ -0,0 +1,39 @@
# frozen_string_literal: true
# Migrate add_header rules to use allow action with tags/headers in metadata
#
# Old pattern:
# waf_action: add_header (5)
# metadata: { header_name: "X-Bot-Agent", header_value: "googlebot" }
#
# New pattern:
# waf_action: allow (1)
# metadata: {
# headers: { "X-Bot-Agent" => "googlebot" },
# tags: ["bot:googlebot"]
# }
#
# Only the waf_action column is rewritten by this migration; the metadata
# reshaping described above is not performed here.
class MigrateAddHeaderRulesToAllowWithTags < ActiveRecord::Migration[8.1]
# Rewrites every rule with waf_action 5 (add_header) to waf_action 1 (allow).
def up
# Change all add_header (5) rules to allow (1)
# Keep metadata as-is for now - will be handled by Rule helper methods
execute <<-SQL
UPDATE rules
SET waf_action = 1 -- allow
WHERE waf_action = 5 -- add_header
SQL
end
# Sets waf_action back to 5 (add_header) for allow rules whose metadata has
# both header_name and header_value keys and no headers key.
# NOTE(review): `?` is used as a key-existence operator, which presumably
# requires rules.metadata to be a PostgreSQL jsonb column - confirm.
# NOTE(review): the match is on metadata keys only, so an allow rule that was
# never add_header but carries these keys would also be reverted.
def down
# This rollback is conservative - only revert rules that clearly came from add_header
# (have header_name/header_value in metadata but not headers)
execute <<-SQL
UPDATE rules
SET waf_action = 5 -- add_header
WHERE waf_action = 1 -- allow
AND metadata ? 'header_name'
AND metadata ? 'header_value'
AND NOT metadata ? 'headers'
SQL
end
end

View File

@@ -10,7 +10,7 @@
# #
# It's strongly recommended that you check this file into your version control system. # It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema[8.1].define(version: 2025_11_16_025003) do ActiveRecord::Schema[8.1].define(version: 2025_11_20_003554) do
# These are extensions that must be enabled in order to support this database # These are extensions that must be enabled in order to support this database
enable_extension "pg_catalog.plpgsql" enable_extension "pg_catalog.plpgsql"
@@ -80,6 +80,7 @@ ActiveRecord::Schema[8.1].define(version: 2025_11_16_025003) do
t.datetime "created_at", null: false t.datetime "created_at", null: false
t.string "environment" t.string "environment"
t.inet "ip_address" t.inet "ip_address"
t.boolean "is_bot", default: false, null: false
t.boolean "is_datacenter", default: false, null: false t.boolean "is_datacenter", default: false, null: false
t.boolean "is_proxy", default: false, null: false t.boolean "is_proxy", default: false, null: false
t.boolean "is_vpn", default: false, null: false t.boolean "is_vpn", default: false, null: false
@@ -105,6 +106,7 @@ ActiveRecord::Schema[8.1].define(version: 2025_11_16_025003) do
t.index ["company"], name: "index_events_on_company" t.index ["company"], name: "index_events_on_company"
t.index ["country"], name: "index_events_on_country" t.index ["country"], name: "index_events_on_country"
t.index ["ip_address"], name: "index_events_on_ip_address" t.index ["ip_address"], name: "index_events_on_ip_address"
t.index ["is_bot"], name: "index_events_on_is_bot"
t.index ["is_datacenter", "is_vpn", "is_proxy"], name: "index_events_on_network_flags" t.index ["is_datacenter", "is_vpn", "is_proxy"], name: "index_events_on_network_flags"
t.index ["network_range_id"], name: "index_events_on_network_range_id" t.index ["network_range_id"], name: "index_events_on_network_range_id"
t.index ["request_host_id", "request_method", "request_segment_ids"], name: "idx_events_host_method_path" t.index ["request_host_id", "request_method", "request_segment_ids"], name: "idx_events_host_method_path"

127
lib/tasks/duckdb.rake Normal file
View File

@@ -0,0 +1,127 @@
# frozen_string_literal: true

# Maintenance tasks for the DuckDB analytics database stored at
# storage/analytics.duckdb.
namespace :duckdb do
  # Formats a file's size in megabytes, rounded to two decimals, so every task
  # reports the database size the same way.
  format_size_mb = ->(path) { "#{(File.size(path) / 1024.0 / 1024.0).round(2)}MB" }

  desc "Rebuild DuckDB analytics database from scratch"
  task rebuild: :environment do
    puts "=" * 80
    puts "DuckDB Rebuild"
    puts "=" * 80
    puts

    duckdb_path = Rails.root.join("storage", "analytics.duckdb")

    # Step 1: Remove any existing database file so the sync starts from empty
    if File.exist?(duckdb_path)
      puts "🗑️ Deleting existing DuckDB database..."
      File.delete(duckdb_path)
      puts " ✅ Deleted: #{duckdb_path}"
      puts
    else
      puts " No existing DuckDB database found"
      puts
    end

    # Step 2: Rebuild from PostgreSQL
    puts "🔨 Rebuilding DuckDB from PostgreSQL events..."
    puts
    start_time = Time.current

    begin
      # Run the sync job inline so the task blocks until the rebuild finishes
      SyncEventsToDuckdbJob.perform_now
      duration = Time.current - start_time

      # Step 3: Verify the rebuild
      event_count = AnalyticsDuckdbService.instance.event_count
      bot_count = AnalyticsDuckdbService.instance.with_connection do |conn|
        result = conn.query("SELECT COUNT(*) FROM events WHERE is_bot = true")
        result.first&.first || 0
      end

      puts "=" * 80
      puts "✅ DuckDB Rebuild Complete!"
      puts "=" * 80
      puts " Duration: #{duration.round(2)}s"
      puts " Total events synced: #{event_count}"
      # Percentages are only meaningful (and safe to compute) with at least one event
      if event_count > 0
        puts " Bot events: #{bot_count} (#{(bot_count.to_f / event_count * 100).round(1)}%)"
        puts " Human events: #{event_count - bot_count} (#{((event_count - bot_count).to_f / event_count * 100).round(1)}%)"
      end
      puts
      puts "📂 Database location: #{duckdb_path}"
      puts "📊 Database size: #{format_size_mb.call(duckdb_path)}"
      puts
    rescue => e
      puts "❌ Error rebuilding DuckDB: #{e.message}"
      puts e.backtrace.first(5).join("\n")
      exit 1
    end
  end

  desc "Show DuckDB statistics"
  task stats: :environment do
    duckdb_path = Rails.root.join("storage", "analytics.duckdb")

    unless File.exist?(duckdb_path)
      puts "❌ DuckDB database not found at: #{duckdb_path}"
      exit 1
    end

    puts "=" * 80
    puts "DuckDB Statistics"
    puts "=" * 80
    puts

    total = AnalyticsDuckdbService.instance.event_count

    AnalyticsDuckdbService.instance.with_connection do |conn|
      # Bot breakdown
      result = conn.query(<<~SQL)
        SELECT
        is_bot,
        COUNT(*) as event_count,
        COUNT(DISTINCT ip_address) as unique_ips
        FROM events
        GROUP BY is_bot
      SQL

      puts "📊 Bot Traffic Breakdown:"
      # Rows are read positionally: [is_bot, event_count, unique_ips]
      result.each do |row|
        type = row[0] ? "🤖 Bots" : "👤 Humans"
        count = row[1]
        ips = row[2]
        percentage = (count.to_f / total * 100).round(1)
        puts " #{type}: #{count} events (#{percentage}%) from #{ips} unique IPs"
      end
      puts

      # Date range
      range_result = conn.query("SELECT MIN(timestamp), MAX(timestamp) FROM events")
      min_ts, max_ts = range_result.first
      puts "📅 Date Range:"
      puts " Oldest event: #{min_ts}"
      puts " Newest event: #{max_ts}"
      puts

      # Database info
      puts "💾 Database Info:"
      puts " Location: #{duckdb_path}"
      puts " Size: #{format_size_mb.call(duckdb_path)}"
      puts " Total events: #{total}"
      puts
    end
  end

  desc "Sync new events from PostgreSQL to DuckDB"
  task sync: :environment do
    puts "🔄 Syncing events from PostgreSQL to DuckDB..."
    start_time = Time.current

    begin
      SyncEventsToDuckdbJob.perform_now
      duration = Time.current - start_time
      puts "✅ Sync complete in #{duration.round(2)}s"
    rescue => e
      puts "❌ Error syncing: #{e.message}"
      exit 1
    end
  end
end

View File

@@ -7,3 +7,7 @@ one:
two: two:
email_address: two@example.com email_address: two@example.com
password_digest: <%= password_digest %> password_digest: <%= password_digest %>
jason:
email_address: jason@example.com
password_digest: <%= password_digest %>

View File

@@ -211,16 +211,51 @@ class NetworkRangeTest < ActiveSupport::TestCase
assert_equal @ipv4_range, children.first assert_equal @ipv4_range, children.first
end end
test "sibling_ranges finds same-level networks" do test "child_ranges works with Apple network hierarchy - 17.240.0.0/14" do
# Create sibling networks # This test demonstrates the current bug in child_ranges method
sibling1 = NetworkRange.create!(network: "192.168.0.0/24") # Expected: 17.240.0.0/14 should have parents but no children in this test setup
@ipv4_range.save! # 192.168.1.0/24
sibling2 = NetworkRange.create!(network: "192.168.2.0/24")
siblings = @ipv4_range.sibling_ranges # Create the target network
assert_includes siblings, sibling1 target_network = NetworkRange.create!(network: "17.240.0.0/14", source: "manual")
assert_includes siblings, sibling2
assert_not_includes siblings, @ipv4_range # Create parent networks
parent1 = NetworkRange.create!(network: "17.240.0.0/13", source: "manual") # Should contain 17.240.0.0/14
parent2 = NetworkRange.create!(network: "17.128.0.0/9", source: "manual") # Should also contain 17.240.0.0/14
# Create some child networks (more specific networks contained by 17.240.0.0/14)
child1 = NetworkRange.create!(network: "17.240.0.0/15", source: "manual") # First half of /14
child2 = NetworkRange.create!(network: "17.242.0.0/15", source: "manual") # Second half of /14
child3 = NetworkRange.create!(network: "17.240.0.0/16", source: "manual") # More specific
child4 = NetworkRange.create!(network: "17.241.0.0/16", source: "manual") # More specific
# Test parent_ranges works correctly
parents = target_network.parent_ranges
assert_includes parents, parent1, "17.240.0.0/13 should be a parent of 17.240.0.0/14"
assert_includes parents, parent2, "17.128.0.0/9 should be a parent of 17.240.0.0/14"
# Test child_ranges - this is currently failing due to the bug
children = target_network.child_ranges
assert_includes children, child1, "17.240.0.0/15 should be a child of 17.240.0.0/14"
assert_includes children, child2, "17.242.0.0/15 should be a child of 17.240.0.0/14"
assert_includes children, child3, "17.240.0.0/16 should be a child of 17.240.0.0/14"
assert_includes children, child4, "17.241.0.0/16 should be a child of 17.240.0.0/14"
assert_not_includes children, parent1, "Parent networks should not be in child_ranges"
assert_not_includes children, parent2, "Parent networks should not be in child_ranges"
assert_not_includes children, target_network, "Self should not be in child_ranges"
# Test that parent can find child in its child_ranges
parent1_children = parent1.child_ranges
assert_includes parent1_children, target_network, "17.240.0.0/14 should be in child_ranges of 17.240.0.0/13"
parent2_children = parent2.child_ranges
assert_includes parent2_children, target_network, "17.240.0.0/14 should be in child_ranges of 17.128.0.0/9"
# Test bidirectional consistency
assert target_network.parent_ranges.include?(parent1), "Parent should list child"
assert parent1.child_ranges.include?(target_network), "Child should list parent"
assert target_network.parent_ranges.include?(parent2), "Parent should list child"
assert parent2.child_ranges.include?(target_network), "Child should list parent"
end end
# Intelligence and Inheritance # Intelligence and Inheritance

View File

@@ -202,4 +202,95 @@ class RuleTest < ActiveSupport::TestCase
assert_equal 8, format[:priority] assert_equal 8, format[:priority]
assert_equal true, format[:enabled] assert_equal true, format[:enabled]
end end
# Tag functionality tests
test "should store and retrieve tags in metadata" do
  # Tags supplied through metadata at creation time are exposed by Rule#tags.
  range = NetworkRange.create!(cidr: "10.0.0.0/8")
  rule_attrs = {
    waf_rule_type: "network",
    waf_action: "allow",
    network_range: range,
    metadata: { tags: ["bot:googlebot", "trusted"] },
    user: users(:one)
  }
  tagged_rule = Rule.create!(rule_attrs)

  assert_equal ["bot:googlebot", "trusted"], tagged_rule.tags
end
test "should add tag to rule" do
  # A rule created without tags gains one via add_tag followed by save!.
  range = NetworkRange.create!(cidr: "10.0.0.0/8")
  rule_attrs = {
    waf_rule_type: "network",
    waf_action: "allow",
    network_range: range,
    user: users(:one)
  }
  untagged_rule = Rule.create!(rule_attrs)

  untagged_rule.add_tag("bot:googlebot")
  untagged_rule.save!

  assert_includes untagged_rule.tags, "bot:googlebot"
end
test "should remove tag from rule" do
  # Removing one tag must leave the other tag in place.
  range = NetworkRange.create!(cidr: "10.0.0.0/8")
  rule_attrs = {
    waf_rule_type: "network",
    waf_action: "allow",
    network_range: range,
    metadata: { tags: ["bot:googlebot", "trusted"] },
    user: users(:one)
  }
  tagged_rule = Rule.create!(rule_attrs)

  tagged_rule.remove_tag("trusted")
  tagged_rule.save!

  assert_not_includes tagged_rule.tags, "trusted"
  assert_includes tagged_rule.tags, "bot:googlebot"
end
test "should check if rule has tag" do
  # has_tag? is true for a stored tag and false for one that was never set.
  range = NetworkRange.create!(cidr: "10.0.0.0/8")
  rule_attrs = {
    waf_rule_type: "network",
    waf_action: "allow",
    network_range: range,
    metadata: { tags: ["bot:googlebot"] },
    user: users(:one)
  }
  tagged_rule = Rule.create!(rule_attrs)

  assert tagged_rule.has_tag?("bot:googlebot")
  assert_not tagged_rule.has_tag?("bot:bingbot")
end
test "should store headers in metadata" do
  # Headers supplied through metadata are exposed by Rule#headers.
  range = NetworkRange.create!(cidr: "10.0.0.0/8")
  rule_metadata = {
    tags: ["bot:googlebot"],
    headers: { "X-Bot-Agent" => "googlebot" }
  }
  header_rule = Rule.create!(
    waf_rule_type: "network",
    waf_action: "allow",
    network_range: range,
    metadata: rule_metadata,
    user: users(:one)
  )

  assert_equal({ "X-Bot-Agent" => "googlebot" }, header_rule.headers)
end
test "should set tags via assignment" do
  # Assigning through tags= and saving makes the same list readable via tags.
  range = NetworkRange.create!(cidr: "10.0.0.0/8")
  rule_attrs = {
    waf_rule_type: "network",
    waf_action: "allow",
    network_range: range,
    user: users(:one)
  }
  untagged_rule = Rule.create!(rule_attrs)

  untagged_rule.tags = ["bot:bingbot", "network:microsoft"]
  untagged_rule.save!

  assert_equal ["bot:bingbot", "network:microsoft"], untagged_rule.tags
end
end end