# frozen_string_literal: true
|
|
|
|
require 'ostruct'
|
|
|
|
# EventDdb - DuckDB-backed analytics queries for events
|
|
# Provides an ActiveRecord-like interface for querying DuckDB events table
|
|
# Falls back to PostgreSQL Event model if DuckDB is unavailable
|
|
class EventDdb
|
|
# Enum mappings from integer to string (matching Event model)
#
# DuckDB stores waf_action as the Rails enum integer; this maps it back to
# the action name. Order matters: index == enum value.
ACTION_MAP = %w[deny allow redirect challenge log]
             .each_with_index
             .to_h { |name, code| [code, name] }
             .freeze
# HTTP request method enum mapping (matching Event model).
# Order matters: index == enum value.
METHOD_MAP = %w[get post put patch delete head options]
             .each_with_index
             .to_h { |name, code| [code, name] }
             .freeze
class << self
|
|
# Get DuckDB service
#
# Returns the AnalyticsDuckdbService singleton used by every query helper
# below to open DuckDB connections.
def service
  AnalyticsDuckdbService.instance
end
# Helper to load parquet files into an in-memory events view
#
# Creates an `events` view over every parquet file under PARQUET_BASE_PATH
# (minute/, hours/, days/, weeks/ subdirectories) so all existing queries
# work without modification, then yields the connection. `.temp` files are
# excluded automatically because they do not match *.parquet.
#
# Returns the block's value, or nil if loading fails so callers can fall
# back to PostgreSQL.
def with_events_from_parquet
  service.with_connection do |conn|
    parquet_pattern = "#{AnalyticsDuckdbService::PARQUET_BASE_PATH}/**/*.parquet"

    # CREATE OR REPLACE is idempotent: a plain CREATE VIEW raises if the
    # connection already carries an `events` view (e.g. connection reuse).
    conn.execute(<<~SQL)
      CREATE OR REPLACE VIEW events AS
      SELECT * FROM read_parquet('#{parquet_pattern}')
    SQL

    yield conn
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error loading parquet files: #{e.message}"
  nil
end
# Total events since timestamp
#
# Returns the number of events with timestamp >= start_time, or nil on
# error so the caller can fall back to the PostgreSQL Event model.
def count_since(start_time)
  with_events_from_parquet do |conn|
    rows = conn.query("SELECT COUNT(*) as count FROM events WHERE timestamp >= ?", start_time)
    rows.first&.first || 0
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error in count_since: #{e.message}"
  nil # Fallback to PostgreSQL
end
# Event breakdown by WAF action
#
# Returns a hash of action name => count (same shape as ActiveRecord
# .group.count), or nil on error. DuckDB stores the enum as an integer;
# the shared ACTION_MAP constant maps it back to the string name
# (0=deny, 1=allow, 2=redirect, 3=challenge, 4=log).
def breakdown_by_action(start_time)
  with_events_from_parquet do |conn|
    result = conn.query(<<~SQL, start_time)
      SELECT waf_action, COUNT(*) as count
      FROM events
      WHERE timestamp >= ?
      GROUP BY waf_action
    SQL

    # Use the frozen class-level ACTION_MAP instead of re-declaring a
    # duplicate local hash on every call (consistency + one allocation less)
    result.to_a.to_h { |row| [ACTION_MAP[row[0]] || "unknown", row[1]] }
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error in breakdown_by_action: #{e.message}"
  nil
end
# Top countries with event counts
#
# Returns an array of [country, count] tuples ordered by count descending
# (same shape as ActiveRecord pluck), or nil on error.
def top_countries(start_time, limit = 10)
  with_events_from_parquet do |conn|
    rows = conn.query(<<~SQL, start_time, limit)
      SELECT country, COUNT(*) as count
      FROM events
      WHERE timestamp >= ? AND country IS NOT NULL
      GROUP BY country
      ORDER BY count DESC
      LIMIT ?
    SQL

    # Each DuckDB row is [country, count]; destructure for clarity
    rows.to_a.map { |country, count| [country, count] }
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error in top_countries: #{e.message}"
  nil
end
# Top blocked IPs
#
# Returns [ip_address, count] tuples for denied requests (waf_action = 0),
# ordered by count descending, or nil on error.
def top_blocked_ips(start_time, limit = 10)
  with_events_from_parquet do |conn|
    rows = conn.query(<<~SQL, start_time, limit)
      SELECT ip_address, COUNT(*) as count
      FROM events
      WHERE timestamp >= ? AND waf_action = 0
      GROUP BY ip_address
      ORDER BY count DESC
      LIMIT ?
    SQL

    # Each DuckDB row is [ip_address, count]
    rows.to_a.map { |ip, count| [ip, count] }
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error in top_blocked_ips: #{e.message}"
  nil
end
# Hourly timeline aggregation
#
# Returns a hash of truncated hour => event count for the half-open
# interval [start_time, end_time), keys ordered ascending; nil on error.
def hourly_timeline(start_time, end_time)
  with_events_from_parquet do |conn|
    rows = conn.query(<<~SQL, start_time, end_time)
      SELECT
        DATE_TRUNC('hour', timestamp) as hour,
        COUNT(*) as count
      FROM events
      WHERE timestamp >= ? AND timestamp < ?
      GROUP BY hour
      ORDER BY hour
    SQL

    # Build the { hour => count } hash from [hour, count] rows
    rows.to_a.each_with_object({}) { |(hour, count), timeline| timeline[hour] = count }
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error in hourly_timeline: #{e.message}"
  nil
end
# Top networks by traffic volume
#
# Returns array of arrays: [network_range_id, event_count, unique_ips],
# ordered by event_count descending; nil on error.
def top_networks(start_time, limit = 50)
  with_events_from_parquet do |conn|
    rows = conn.query(<<~SQL, start_time, limit)
      SELECT
        network_range_id,
        COUNT(*) as event_count,
        COUNT(DISTINCT ip_address) as unique_ips
      FROM events
      WHERE timestamp >= ? AND network_range_id IS NOT NULL
      GROUP BY network_range_id
      ORDER BY event_count DESC
      LIMIT ?
    SQL

    # Raw row arrays already match the documented shape
    rows.to_a
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error in top_networks: #{e.message}"
  nil
end
# Top companies
#
# Returns array of OpenStruct objects with attributes: company,
# event_count, unique_ips, network_count — ordered by event_count
# descending; nil on error.
def top_companies(start_time, limit = 20)
  with_events_from_parquet do |conn|
    rows = conn.query(<<~SQL, start_time, limit)
      SELECT
        company,
        COUNT(*) as event_count,
        COUNT(DISTINCT ip_address) as unique_ips,
        COUNT(DISTINCT network_range_id) as network_count
      FROM events
      WHERE timestamp >= ? AND company IS NOT NULL
      GROUP BY company
      ORDER BY event_count DESC
      LIMIT ?
    SQL

    # Destructure each row into an OpenStruct for attribute-style access
    rows.to_a.map do |company, event_count, unique_ips, network_count|
      OpenStruct.new(
        company: company,
        event_count: event_count,
        unique_ips: unique_ips,
        network_count: network_count
      )
    end
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error in top_companies: #{e.message}"
  nil
end
# Top ASNs
#
# Returns array of OpenStruct objects with attributes: asn, asn_org,
# event_count, unique_ips, network_count — ordered by event_count
# descending; nil on error.
def top_asns(start_time, limit = 15)
  with_events_from_parquet do |conn|
    rows = conn.query(<<~SQL, start_time, limit)
      SELECT
        asn,
        asn_org,
        COUNT(*) as event_count,
        COUNT(DISTINCT ip_address) as unique_ips,
        COUNT(DISTINCT network_range_id) as network_count
      FROM events
      WHERE timestamp >= ? AND asn IS NOT NULL
      GROUP BY asn, asn_org
      ORDER BY event_count DESC
      LIMIT ?
    SQL

    # Destructure each row into an OpenStruct for attribute-style access
    rows.to_a.map do |asn, asn_org, event_count, unique_ips, network_count|
      OpenStruct.new(
        asn: asn,
        asn_org: asn_org,
        event_count: event_count,
        unique_ips: unique_ips,
        network_count: network_count
      )
    end
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error in top_asns: #{e.message}"
  nil
end
# Network type breakdown (datacenter, VPN, proxy, standard)
#
# The CASE expression assigns every event to exactly one bucket
# (datacenter takes precedence over vpn, vpn over proxy). Returns:
#   { network_type => { "event_count" =>, "unique_ips" =>, "network_count" => } }
# or nil on error.
def network_type_breakdown(start_time)
  with_events_from_parquet do |conn|
    rows = conn.query(<<~SQL, start_time)
      SELECT
        CASE
          WHEN is_datacenter THEN 'datacenter'
          WHEN is_vpn THEN 'vpn'
          WHEN is_proxy THEN 'proxy'
          ELSE 'standard'
        END as network_type,
        COUNT(*) as event_count,
        COUNT(DISTINCT ip_address) as unique_ips,
        COUNT(DISTINCT network_range_id) as network_count
      FROM events
      WHERE timestamp >= ?
      GROUP BY network_type
    SQL

    # Fold [type, events, ips, networks] rows into the documented hash
    rows.to_a.each_with_object({}) do |(network_type, event_count, unique_ips, network_count), breakdown|
      breakdown[network_type] = {
        "event_count" => event_count,
        "unique_ips" => unique_ips,
        "network_count" => network_count
      }
    end
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error in network_type_breakdown: #{e.message}"
  nil
end
# Top countries with detailed stats (event count and unique IPs)
#
# Returns array of OpenStruct objects with attributes: country,
# event_count, unique_ips — ordered by event_count descending; nil on
# error.
def top_countries_with_stats(start_time, limit = 15)
  with_events_from_parquet do |conn|
    rows = conn.query(<<~SQL, start_time, limit)
      SELECT
        country,
        COUNT(*) as event_count,
        COUNT(DISTINCT ip_address) as unique_ips
      FROM events
      WHERE timestamp >= ? AND country IS NOT NULL
      GROUP BY country
      ORDER BY event_count DESC
      LIMIT ?
    SQL

    # Destructure each row into an OpenStruct for attribute-style access
    rows.to_a.map do |country, event_count, unique_ips|
      OpenStruct.new(
        country: country,
        event_count: event_count,
        unique_ips: unique_ips
      )
    end
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error in top_countries_with_stats: #{e.message}"
  nil
end
# Network type stats with formatted output matching controller expectations
#
# Returns hash keyed by type ("datacenter"/"vpn"/"proxy"/"standard"), each
# value containing label, networks, events, unique_ips, percentage; nil on
# error.
#
# NOTE: the previous implementation opened a DuckDB connection, ran a
# COUNT(*) query, and then called network_type_breakdown — which opens a
# SECOND nested connection and recreates the events view. Since the CASE
# buckets in network_type_breakdown are exhaustive (every event lands in
# exactly one bucket), the total equals the sum of the bucket counts, so
# both the nested connection and the extra query are unnecessary.
def network_type_stats(start_time)
  breakdown = network_type_breakdown(start_time)
  return nil unless breakdown

  # Total for percentage calculation: buckets partition all events
  total_events = breakdown.values.sum { |stats| stats["event_count"] }

  {
    'datacenter' => 'Datacenter',
    'vpn' => 'VPN',
    'proxy' => 'Proxy',
    'standard' => 'Standard'
  }.each_with_object({}) do |(type, label), results|
    stats = breakdown[type]
    event_count = stats ? stats["event_count"] : 0

    results[type] = {
      label: label,
      networks: stats ? stats["network_count"] : 0,
      events: event_count,
      unique_ips: stats ? stats["unique_ips"] : 0,
      percentage: total_events > 0 ? ((event_count.to_f / total_events) * 100).round(1) : 0
    }
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error in network_type_stats: #{e.message}"
  nil
end
# Network range traffic statistics
#
# Returns comprehensive stats for the given network range ID(s):
#   { total_requests:, unique_ips:, blocked_requests:, allowed_requests: }
# or nil when no IDs are given or on error.
def network_traffic_stats(network_range_ids)
  ids = Array(network_range_ids)
  return nil if ids.empty?

  with_events_from_parquet do |conn|
    # One "?" placeholder per ID for the IN clause
    placeholders = (["?"] * ids.length).join(", ")

    # A single pass over events computes all four aggregates at once
    result = conn.query(<<~SQL, *ids)
      SELECT
        COUNT(*) as total_requests,
        COUNT(DISTINCT ip_address) as unique_ips,
        SUM(CASE WHEN waf_action = 0 THEN 1 ELSE 0 END) as blocked_requests,
        SUM(CASE WHEN waf_action = 1 THEN 1 ELSE 0 END) as allowed_requests
      FROM events
      WHERE network_range_id IN (#{placeholders})
    SQL

    row = result.first
    return nil unless row

    total, unique_ips, blocked, allowed = row
    {
      total_requests: total || 0,
      unique_ips: unique_ips || 0,
      blocked_requests: blocked || 0,
      allowed_requests: allowed || 0
    }
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error in network_traffic_stats: #{e.message}"
  nil
end
# Top paths for network range(s)
#
# Returns [request_path, count] tuples ordered by count descending, or
# nil when no IDs are given or on error.
def network_top_paths(network_range_ids, limit = 10)
  ids = Array(network_range_ids)
  return nil if ids.empty?

  with_events_from_parquet do |conn|
    # One "?" placeholder per ID for the IN clause
    placeholders = (["?"] * ids.length).join(", ")

    rows = conn.query(<<~SQL, *ids, limit)
      SELECT
        request_path,
        COUNT(*) as count
      FROM events
      WHERE network_range_id IN (#{placeholders})
        AND request_path IS NOT NULL
      GROUP BY request_path
      ORDER BY count DESC
      LIMIT ?
    SQL

    rows.to_a.map { |path, count| [path, count] }
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error in network_top_paths: #{e.message}"
  nil
end
# Top user agents for network range(s)
#
# Returns [user_agent, count] tuples ordered by count descending, or nil
# when no IDs are given or on error.
def network_top_user_agents(network_range_ids, limit = 5)
  ids = Array(network_range_ids)
  return nil if ids.empty?

  with_events_from_parquet do |conn|
    # One "?" placeholder per ID for the IN clause
    placeholders = (["?"] * ids.length).join(", ")

    rows = conn.query(<<~SQL, *ids, limit)
      SELECT
        user_agent,
        COUNT(*) as count
      FROM events
      WHERE network_range_id IN (#{placeholders})
        AND user_agent IS NOT NULL
      GROUP BY user_agent
      ORDER BY count DESC
      LIMIT ?
    SQL

    rows.to_a.map { |agent, count| [agent, count] }
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error in network_top_user_agents: #{e.message}"
  nil
end
# Count events for network range(s)
#
# Returns the integer count of all events in the given network range(s),
# or nil when no IDs are given or on error.
def network_event_count(network_range_ids)
  ids = Array(network_range_ids)
  return nil if ids.empty?

  with_events_from_parquet do |conn|
    # One "?" placeholder per ID for the IN clause
    placeholders = (["?"] * ids.length).join(", ")

    rows = conn.query(<<~SQL, *ids)
      SELECT COUNT(*) as count
      FROM events
      WHERE network_range_id IN (#{placeholders})
    SQL

    rows.first&.first || 0
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error in network_event_count: #{e.message}"
  nil
end
# Full user agent tally for network range(s)
#
# Returns a hash of user_agent => count (same shape as Enumerable#tally)
# for all agents seen in the network(s), or nil when no IDs are given or
# on error.
def network_agent_tally(network_range_ids)
  ids = Array(network_range_ids)
  return nil if ids.empty?

  with_events_from_parquet do |conn|
    # One "?" placeholder per ID for the IN clause
    placeholders = (["?"] * ids.length).join(", ")

    rows = conn.query(<<~SQL, *ids)
      SELECT
        user_agent,
        COUNT(*) as count
      FROM events
      WHERE network_range_id IN (#{placeholders})
        AND user_agent IS NOT NULL
      GROUP BY user_agent
    SQL

    # Fold [agent, count] rows into the tally-shaped hash
    rows.to_a.each_with_object({}) { |(agent, count), tally| tally[agent] = count }
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error in network_agent_tally: #{e.message}"
  nil
end
# Suspicious network activity patterns
#
# Detects three signals since start_time:
#   - high-volume networks (more than 5x the average per-network count)
#   - high deny-rate networks (>50% blocked, minimum 10 requests)
#   - distributed companies (appearing with more than 5 unique IPs)
#
# Returns { high_volume:, high_deny_rate:, distributed_companies: } or nil
# on error.
#
# Fix: each result set is materialized with .to_a exactly once. The old
# code called .to_a twice per result (once for .length, once for .map);
# DuckDB result objects behave as one-shot cursors, so the second pass can
# silently yield no rows.
def suspicious_patterns(start_time)
  with_events_from_parquet do |conn|
    # Average events per network, to derive the high-volume threshold
    avg_query = conn.query(<<~SQL, start_time)
      SELECT
        AVG(event_count) as avg_events
      FROM (
        SELECT network_range_id, COUNT(*) as event_count
        FROM events
        WHERE timestamp >= ? AND network_range_id IS NOT NULL
        GROUP BY network_range_id
      ) network_stats
    SQL

    avg_events = avg_query.first&.first || 0
    threshold = avg_events * 5

    # High volume networks (5x average) — materialized once
    high_volume = conn.query(<<~SQL, start_time, threshold).to_a
      SELECT
        network_range_id,
        COUNT(*) as event_count
      FROM events
      WHERE timestamp >= ? AND network_range_id IS NOT NULL
      GROUP BY network_range_id
      HAVING COUNT(*) > ?
      ORDER BY event_count DESC
    SQL

    # High deny rate networks (>50% blocked, min 10 requests)
    high_deny = conn.query(<<~SQL, start_time).to_a
      SELECT
        network_range_id,
        SUM(CASE WHEN waf_action = 0 THEN 1 ELSE 0 END) as denied_count,
        COUNT(*) as total_count
      FROM events
      WHERE timestamp >= ? AND network_range_id IS NOT NULL
      GROUP BY network_range_id
      HAVING CAST(SUM(CASE WHEN waf_action = 0 THEN 1 ELSE 0 END) AS FLOAT) / COUNT(*) > 0.5
        AND COUNT(*) >= 10
      ORDER BY denied_count DESC
    SQL

    # Distributed companies (appearing with 5+ unique IPs)
    distributed_companies = conn.query(<<~SQL, start_time).to_a
      SELECT
        company,
        COUNT(DISTINCT ip_address) as ip_count
      FROM events
      WHERE timestamp >= ? AND company IS NOT NULL
      GROUP BY company
      HAVING COUNT(DISTINCT ip_address) > 5
      ORDER BY ip_count DESC
      LIMIT 10
    SQL

    {
      high_volume: {
        count: high_volume.length,
        networks: high_volume.map { |row| row[0] } # network_range_id
      },
      high_deny_rate: {
        count: high_deny.length,
        network_ids: high_deny.map { |row| row[0] } # network_range_id
      },
      distributed_companies: distributed_companies.map { |row|
        {
          company: row[0], # company name
          subnets: row[1]  # ip_count
        }
      }
    }
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error in suspicious_patterns: #{e.message}"
  nil
end
# Bot traffic analysis - breakdown of bot vs human traffic
#
# Returns { "bot" | "human" => { "event_count" =>, "unique_ips" => } },
# or nil on error.
def bot_traffic_breakdown(start_time)
  with_events_from_parquet do |conn|
    rows = conn.query(<<~SQL, start_time)
      SELECT
        is_bot,
        COUNT(*) as event_count,
        COUNT(DISTINCT ip_address) as unique_ips
      FROM events
      WHERE timestamp >= ?
      GROUP BY is_bot
    SQL

    # Fold [is_bot, event_count, unique_ips] rows into the keyed hash
    rows.to_a.each_with_object({}) do |(is_bot, event_count, unique_ips), breakdown|
      label = is_bot ? "bot" : "human"
      breakdown[label] = {
        "event_count" => event_count,
        "unique_ips" => unique_ips
      }
    end
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error in bot_traffic_breakdown: #{e.message}"
  nil
end
# Count human traffic (non-bot) since timestamp
#
# Returns the integer count of events with is_bot = false, or nil on
# error.
def human_traffic_count(start_time)
  with_events_from_parquet do |conn|
    rows = conn.query(<<~SQL, start_time)
      SELECT COUNT(*) as count
      FROM events
      WHERE timestamp >= ? AND is_bot = false
    SQL

    rows.first&.first || 0
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error in human_traffic_count: #{e.message}"
  nil
end
# Count bot traffic since timestamp
#
# Returns the integer count of events with is_bot = true, or nil on error.
def bot_traffic_count(start_time)
  with_events_from_parquet do |conn|
    rows = conn.query(<<~SQL, start_time)
      SELECT COUNT(*) as count
      FROM events
      WHERE timestamp >= ? AND is_bot = true
    SQL

    rows.first&.first || 0
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error in bot_traffic_count: #{e.message}"
  nil
end
# Top bot user agents
#
# Returns array of hashes { user_agent:, event_count:, unique_ips: } for
# bot traffic, ordered by event_count descending; nil on error.
def top_bot_user_agents(start_time, limit = 20)
  with_events_from_parquet do |conn|
    rows = conn.query(<<~SQL, start_time, limit)
      SELECT
        user_agent,
        COUNT(*) as event_count,
        COUNT(DISTINCT ip_address) as unique_ips
      FROM events
      WHERE timestamp >= ? AND is_bot = true AND user_agent IS NOT NULL
      GROUP BY user_agent
      ORDER BY event_count DESC
      LIMIT ?
    SQL

    # Destructure each [user_agent, event_count, unique_ips] row
    rows.to_a.map do |user_agent, event_count, unique_ips|
      {
        user_agent: user_agent,
        event_count: event_count,
        unique_ips: unique_ips
      }
    end
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error in top_bot_user_agents: #{e.message}"
  nil
end
# Bot traffic timeline (hourly breakdown)
#
# Returns a hash keyed by truncated hour, each value containing
# "bot_count", "human_count" and "total", for [start_time, end_time);
# nil on error.
def bot_traffic_timeline(start_time, end_time)
  with_events_from_parquet do |conn|
    rows = conn.query(<<~SQL, start_time, end_time)
      SELECT
        DATE_TRUNC('hour', timestamp) as hour,
        SUM(CASE WHEN is_bot = true THEN 1 ELSE 0 END) as bot_count,
        SUM(CASE WHEN is_bot = false THEN 1 ELSE 0 END) as human_count
      FROM events
      WHERE timestamp >= ? AND timestamp < ?
      GROUP BY hour
      ORDER BY hour
    SQL

    # Fold [hour, bot_count, human_count] rows into the keyed hash
    rows.to_a.each_with_object({}) do |(hour, bot_count, human_count), timeline|
      timeline[hour] = {
        "bot_count" => bot_count,
        "human_count" => human_count,
        "total" => bot_count + human_count
      }
    end
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error in bot_traffic_timeline: #{e.message}"
  nil
end
# Search events with filters and pagination
#
# Returns { total_count:, events: [], page:, per_page: } or nil on error.
# Supported filters: ip, waf_action, country, rule_id, company, asn,
# network_type, network_range_id, exclude_bots.
#
# Fix: the fallback previously called search_duckdb(filters, page,
# per_page) with positional arguments, but search_duckdb takes page: and
# per_page: as KEYWORD arguments — the resulting ArgumentError was
# swallowed by the rescue, so the fallback always returned nil.
def search(filters = {}, page: 1, per_page: 50)
  # Determine which Parquet files cover the searchable window
  parquet_files = service.parquet_files_for_range(1.year.ago, Time.current)

  if parquet_files.empty?
    Rails.logger.warn "[EventDdb] No Parquet files found, falling back to DuckDB"
    return search_duckdb(filters, page: page, per_page: per_page)
  end

  # Query Parquet files using in-memory DuckDB (no file locks!)
  service.with_parquet_connection do |conn|
    where_clause, params = build_where_clause(filters)

    # Explicit quoted file list for read_parquet
    file_list = parquet_files.map { |f| "'#{f}'" }.join(", ")

    # Total match count (for pagination UI)
    count_sql = "SELECT COUNT(*) FROM read_parquet([#{file_list}])#{where_clause}"
    total_count = conn.query(count_sql, *params).first&.first || 0

    offset = (page - 1) * per_page

    data_sql = <<~SQL
      SELECT
        id, timestamp, ip_address, network_range_id, country, company,
        asn, asn_org, is_datacenter, is_vpn, is_proxy, is_bot,
        waf_action, request_method, response_status, rule_id,
        request_path, user_agent, tags
      FROM read_parquet([#{file_list}])
      #{where_clause}
      ORDER BY timestamp DESC
      LIMIT ? OFFSET ?
    SQL

    result = conn.query(data_sql, *params, per_page, offset)

    {
      total_count: total_count,
      events: result.to_a.map { |row| row_to_event(row) },
      page: page,
      per_page: per_page
    }
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error in Parquet search: #{e.message}"
  Rails.logger.error e.backtrace.join("\n")
  nil
end
# Fallback to querying DuckDB directly (for backward compatibility)
#
# Same contract as .search: returns { total_count:, events: [], page:,
# per_page: } or nil on error, but runs against the `events` view created
# by with_events_from_parquet instead of an explicit file list.
def search_duckdb(filters = {}, page: 1, per_page: 50)
  with_events_from_parquet do |conn|
    where_clause, params = build_where_clause(filters)

    # Total match count (for pagination UI)
    count_sql = "SELECT COUNT(*) FROM events#{where_clause}"
    total_count = conn.query(count_sql, *params).first&.first || 0

    offset = (page - 1) * per_page

    data_sql = <<~SQL
      SELECT
        id, timestamp, ip_address, network_range_id, country, company,
        asn, asn_org, is_datacenter, is_vpn, is_proxy, is_bot,
        waf_action, request_method, response_status, rule_id,
        request_path, user_agent, tags
      FROM events
      #{where_clause}
      ORDER BY timestamp DESC
      LIMIT ? OFFSET ?
    SQL

    result = conn.query(data_sql, *params, per_page, offset)

    {
      total_count: total_count,
      events: result.to_a.map { |row| row_to_event(row) },
      page: page,
      per_page: per_page
    }
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error in DuckDB search: #{e.message}"
  Rails.logger.error e.backtrace.join("\n")
  nil
end
private
|
|
|
|
# Build WHERE clause and params from filters hash
#
# Returns [where_clause_string, params_array]. The clause begins with
# " WHERE " when any condition applies, otherwise it is "". Conditions are
# appended in a fixed order so params line up with their placeholders.
def build_where_clause(filters)
  conditions = []
  params = []

  # Append one condition; :none marks conditions with no bound parameter
  add = lambda do |sql, value = :none|
    conditions << sql
    params << value unless value == :none
  end

  add.call("ip_address = ?", filters[:ip]) if filters[:ip].present?

  if filters[:waf_action].present?
    # Convert string action name back to its integer enum value
    action_int = ACTION_MAP.key(filters[:waf_action].to_s)
    add.call("waf_action = ?", action_int) if action_int
  end

  add.call("country = ?", filters[:country]) if filters[:country].present?
  add.call("rule_id = ?", filters[:rule_id].to_i) if filters[:rule_id].present?
  add.call("company ILIKE ?", "%#{filters[:company]}%") if filters[:company].present?
  add.call("asn = ?", filters[:asn].to_i) if filters[:asn].present?
  add.call("network_range_id = ?", filters[:network_range_id].to_i) if filters[:network_range_id].present?

  # Network type filter — boolean flags need no bound parameter
  if filters[:network_type].present?
    case filters[:network_type].to_s.downcase
    when "datacenter"
      add.call("is_datacenter = true")
    when "vpn"
      add.call("is_vpn = true")
    when "proxy"
      add.call("is_proxy = true")
    when "standard"
      add.call("(is_datacenter = false AND is_vpn = false AND is_proxy = false)")
    end
  end

  # Bot filtering (accepts boolean true or the string "true")
  add.call("is_bot = false") if [true, "true"].include?(filters[:exclude_bots])

  where_clause = conditions.empty? ? "" : " WHERE #{conditions.join(' AND ')}"
  [where_clause, params]
end
# Convert DuckDB row array to event-like OpenStruct
#
# The row's column order matches the SELECT list in search/search_duckdb.
# Integer enums are translated back to strings via ACTION_MAP/METHOD_MAP;
# network_range and rule are left nil and loaded separately in the
# controller.
def row_to_event(row)
  id, timestamp, ip_address, network_range_id, country, company,
    asn, asn_org, is_datacenter, is_vpn, is_proxy, is_bot,
    waf_action, request_method, response_status, rule_id,
    request_path, user_agent, tags = row

  OpenStruct.new(
    id: id,
    timestamp: timestamp,
    ip_address: ip_address,
    network_range_id: network_range_id,
    country: country,
    company: company,
    asn: asn,
    asn_org: asn_org,
    is_datacenter: is_datacenter,
    is_vpn: is_vpn,
    is_proxy: is_proxy,
    is_bot: is_bot,
    waf_action: ACTION_MAP[waf_action] || "unknown",
    request_method: METHOD_MAP[request_method],
    response_status: response_status,
    rule_id: rule_id,
    request_path: request_path,
    user_agent: user_agent,
    tags: tags || [],
    # Helper attribute mirroring country for view lookups
    lookup_country: country,
    # Loaded separately in the controller
    network_range: nil,
    rule: nil
  )
end
end
|
|
end
|