Use only parquet files for events
This commit is contained in:
@@ -31,9 +31,31 @@ class EventDdb
|
||||
AnalyticsDuckdbService.instance
|
||||
end
|
||||
|
||||
# Runs the given block against a connection on which an `events` view has been
# defined over every Parquet file found (recursively) under
# AnalyticsDuckdbService::PARQUET_BASE_PATH, so queries written against an
# `events` relation work without modification.
#
# Only names ending in `.parquet` match the glob; files with any other suffix
# (e.g. `.temp`) are not read.
#
# Yields the connection. Returns whatever `service.with_connection` returns
# (presumably the block's value - confirm against the service). Any
# StandardError, whether raised while creating the view or inside the caller's
# block, is logged here and swallowed, and nil is returned instead.
def with_events_from_parquet(&block)
  service.with_connection do |conn|
    # Double any single quote so the path cannot terminate the SQL string literal.
    parquet_pattern = "#{AnalyticsDuckdbService::PARQUET_BASE_PATH}/**/*.parquet".gsub("'", "''")

    # OR REPLACE keeps this safe when the connection already carries an
    # `events` view; a plain CREATE VIEW would raise in that case.
    conn.execute(<<~SQL)
      CREATE OR REPLACE VIEW events AS
      SELECT * FROM read_parquet('#{parquet_pattern}')
    SQL

    yield conn
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error loading parquet files: #{e.message}"
  nil
end
|
||||
|
||||
# Total events since timestamp
|
||||
def count_since(start_time)
|
||||
service.with_connection do |conn|
|
||||
with_events_from_parquet do |conn|
|
||||
result = conn.query("SELECT COUNT(*) as count FROM events WHERE timestamp >= ?", start_time)
|
||||
result.first&.first || 0
|
||||
end
|
||||
@@ -44,7 +66,7 @@ class EventDdb
|
||||
|
||||
# Event breakdown by WAF action
|
||||
def breakdown_by_action(start_time)
|
||||
service.with_connection do |conn|
|
||||
with_events_from_parquet do |conn|
|
||||
result = conn.query(<<~SQL, start_time)
|
||||
SELECT waf_action, COUNT(*) as count
|
||||
FROM events
|
||||
@@ -65,7 +87,7 @@ class EventDdb
|
||||
|
||||
# Top countries with event counts
|
||||
def top_countries(start_time, limit = 10)
|
||||
service.with_connection do |conn|
|
||||
with_events_from_parquet do |conn|
|
||||
result = conn.query(<<~SQL, start_time, limit)
|
||||
SELECT country, COUNT(*) as count
|
||||
FROM events
|
||||
@@ -86,7 +108,7 @@ class EventDdb
|
||||
|
||||
# Top blocked IPs
|
||||
def top_blocked_ips(start_time, limit = 10)
|
||||
service.with_connection do |conn|
|
||||
with_events_from_parquet do |conn|
|
||||
result = conn.query(<<~SQL, start_time, limit)
|
||||
SELECT ip_address, COUNT(*) as count
|
||||
FROM events
|
||||
@@ -106,7 +128,7 @@ class EventDdb
|
||||
|
||||
# Hourly timeline aggregation
|
||||
def hourly_timeline(start_time, end_time)
|
||||
service.with_connection do |conn|
|
||||
with_events_from_parquet do |conn|
|
||||
result = conn.query(<<~SQL, start_time, end_time)
|
||||
SELECT
|
||||
DATE_TRUNC('hour', timestamp) as hour,
|
||||
@@ -129,7 +151,7 @@ class EventDdb
|
||||
# Top networks by traffic volume
|
||||
# Returns array of arrays: [network_range_id, event_count, unique_ips]
|
||||
def top_networks(start_time, limit = 50)
|
||||
service.with_connection do |conn|
|
||||
with_events_from_parquet do |conn|
|
||||
result = conn.query(<<~SQL, start_time, limit)
|
||||
SELECT
|
||||
network_range_id,
|
||||
@@ -152,7 +174,7 @@ class EventDdb
|
||||
# Top companies
|
||||
# Returns array of OpenStruct objects with: company, event_count, unique_ips, network_count
|
||||
def top_companies(start_time, limit = 20)
|
||||
service.with_connection do |conn|
|
||||
with_events_from_parquet do |conn|
|
||||
result = conn.query(<<~SQL, start_time, limit)
|
||||
SELECT
|
||||
company,
|
||||
@@ -184,7 +206,7 @@ class EventDdb
|
||||
# Top ASNs
|
||||
# Returns array of OpenStruct objects with: asn, asn_org, event_count, unique_ips, network_count
|
||||
def top_asns(start_time, limit = 15)
|
||||
service.with_connection do |conn|
|
||||
with_events_from_parquet do |conn|
|
||||
result = conn.query(<<~SQL, start_time, limit)
|
||||
SELECT
|
||||
asn,
|
||||
@@ -218,7 +240,7 @@ class EventDdb
|
||||
# Network type breakdown (datacenter, VPN, proxy, standard)
|
||||
# Returns hash with network_type as key and hash of stats as value
|
||||
def network_type_breakdown(start_time)
|
||||
service.with_connection do |conn|
|
||||
with_events_from_parquet do |conn|
|
||||
result = conn.query(<<~SQL, start_time)
|
||||
SELECT
|
||||
CASE
|
||||
@@ -255,7 +277,7 @@ class EventDdb
|
||||
# Top countries with detailed stats (event count and unique IPs)
|
||||
# Returns array of OpenStruct objects with: country, event_count, unique_ips
|
||||
def top_countries_with_stats(start_time, limit = 15)
|
||||
service.with_connection do |conn|
|
||||
with_events_from_parquet do |conn|
|
||||
result = conn.query(<<~SQL, start_time, limit)
|
||||
SELECT
|
||||
country,
|
||||
@@ -285,7 +307,7 @@ class EventDdb
|
||||
# Network type stats with formatted output matching controller expectations
|
||||
# Returns hash with type keys containing label, networks, events, unique_ips, percentage
|
||||
def network_type_stats(start_time)
|
||||
service.with_connection do |conn|
|
||||
with_events_from_parquet do |conn|
|
||||
# Get total events for percentage calculation
|
||||
total_result = conn.query("SELECT COUNT(*) as total FROM events WHERE timestamp >= ?", start_time)
|
||||
total_events = total_result.first&.first || 0
|
||||
@@ -328,7 +350,7 @@ class EventDdb
|
||||
network_range_ids = Array(network_range_ids)
|
||||
return nil if network_range_ids.empty?
|
||||
|
||||
service.with_connection do |conn|
|
||||
with_events_from_parquet do |conn|
|
||||
# Build IN clause with placeholders
|
||||
placeholders = network_range_ids.map { "?" }.join(", ")
|
||||
|
||||
@@ -363,7 +385,7 @@ class EventDdb
|
||||
network_range_ids = Array(network_range_ids)
|
||||
return nil if network_range_ids.empty?
|
||||
|
||||
service.with_connection do |conn|
|
||||
with_events_from_parquet do |conn|
|
||||
# Build IN clause with placeholders
|
||||
placeholders = network_range_ids.map { "?" }.join(", ")
|
||||
|
||||
@@ -391,7 +413,7 @@ class EventDdb
|
||||
network_range_ids = Array(network_range_ids)
|
||||
return nil if network_range_ids.empty?
|
||||
|
||||
service.with_connection do |conn|
|
||||
with_events_from_parquet do |conn|
|
||||
# Build IN clause with placeholders
|
||||
placeholders = network_range_ids.map { "?" }.join(", ")
|
||||
|
||||
@@ -414,13 +436,36 @@ class EventDdb
|
||||
nil
|
||||
end
|
||||
|
||||
# Counts all events recorded for the given network range id(s).
# Accepts a single id or a list of ids.
# Returns nil when no ids are given or when an error is rescued; otherwise the
# count from the query, or 0 when the query yields no row.
def network_event_count(network_range_ids)
  ids = Array(network_range_ids)
  return nil if ids.empty?

  with_events_from_parquet do |conn|
    # One "?" bind marker per id for the IN (...) list
    in_list = Array.new(ids.size, "?").join(", ")
    sql = <<~SQL
      SELECT COUNT(*) as count
      FROM events
      WHERE network_range_id IN (#{in_list})
    SQL

    count = conn.query(sql, *ids).first&.first
    count || 0
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error in network_event_count: #{e.message}"
  nil
end
|
||||
|
||||
# Full user agent tally for network range(s)
|
||||
# Returns hash of user_agent => count for all agents in the network
|
||||
def network_agent_tally(network_range_ids)
|
||||
network_range_ids = Array(network_range_ids)
|
||||
return nil if network_range_ids.empty?
|
||||
|
||||
service.with_connection do |conn|
|
||||
with_events_from_parquet do |conn|
|
||||
# Build IN clause with placeholders
|
||||
placeholders = network_range_ids.map { "?" }.join(", ")
|
||||
|
||||
@@ -445,7 +490,7 @@ class EventDdb
|
||||
# Suspicious network activity patterns
|
||||
# Detects high-volume networks, high deny rates, and distributed companies
|
||||
def suspicious_patterns(start_time)
|
||||
service.with_connection do |conn|
|
||||
with_events_from_parquet do |conn|
|
||||
# High volume networks (5x average)
|
||||
avg_query = conn.query(<<~SQL, start_time)
|
||||
SELECT
|
||||
@@ -523,7 +568,7 @@ class EventDdb
|
||||
|
||||
# Bot traffic analysis - breakdown of bot vs human traffic
|
||||
def bot_traffic_breakdown(start_time)
|
||||
service.with_connection do |conn|
|
||||
with_events_from_parquet do |conn|
|
||||
result = conn.query(<<~SQL, start_time)
|
||||
SELECT
|
||||
is_bot,
|
||||
@@ -553,7 +598,7 @@ class EventDdb
|
||||
|
||||
# Count human traffic (non-bot) since timestamp
|
||||
def human_traffic_count(start_time)
|
||||
service.with_connection do |conn|
|
||||
with_events_from_parquet do |conn|
|
||||
result = conn.query(<<~SQL, start_time)
|
||||
SELECT COUNT(*) as count
|
||||
FROM events
|
||||
@@ -569,7 +614,7 @@ class EventDdb
|
||||
|
||||
# Count bot traffic since timestamp
|
||||
def bot_traffic_count(start_time)
|
||||
service.with_connection do |conn|
|
||||
with_events_from_parquet do |conn|
|
||||
result = conn.query(<<~SQL, start_time)
|
||||
SELECT COUNT(*) as count
|
||||
FROM events
|
||||
@@ -585,7 +630,7 @@ class EventDdb
|
||||
|
||||
# Top bot user agents
|
||||
def top_bot_user_agents(start_time, limit = 20)
|
||||
service.with_connection do |conn|
|
||||
with_events_from_parquet do |conn|
|
||||
result = conn.query(<<~SQL, start_time, limit)
|
||||
SELECT
|
||||
user_agent,
|
||||
@@ -614,7 +659,7 @@ class EventDdb
|
||||
|
||||
# Bot traffic timeline (hourly breakdown)
|
||||
def bot_traffic_timeline(start_time, end_time)
|
||||
service.with_connection do |conn|
|
||||
with_events_from_parquet do |conn|
|
||||
result = conn.query(<<~SQL, start_time, end_time)
|
||||
SELECT
|
||||
DATE_TRUNC('hour', timestamp) as hour,
|
||||
@@ -648,7 +693,63 @@ class EventDdb
|
||||
# Returns { total_count:, events:[], page:, per_page: }
|
||||
# Supports filters: ip, waf_action, country, rule_id, company, asn, network_type, network_range_id, exclude_bots
|
||||
def search(filters = {}, page: 1, per_page: 50)
  # Parquet files covering the last year up to now
  parquet_files = service.parquet_files_for_range(1.year.ago, Time.current)

  if parquet_files.empty?
    Rails.logger.warn "[EventDdb] No Parquet files found, falling back to DuckDB"
    # search_duckdb takes page/per_page as keyword arguments; passing them
    # positionally raises ArgumentError, which the rescue below would turn
    # into a silent nil.
    return search_duckdb(filters, page: page, per_page: per_page)
  end

  # Query the listed Parquet files through the service's Parquet connection
  service.with_parquet_connection do |conn|
    where_clause, params = build_where_clause(filters)

    # Quote each path as a SQL string literal, doubling embedded single quotes
    file_list = parquet_files.map do |path|
      escaped = path.to_s.gsub("'", "''")
      "'#{escaped}'"
    end.join(", ")

    # Total number of matching rows (ignores pagination)
    count_sql = "SELECT COUNT(*) FROM read_parquet([#{file_list}])#{where_clause}"
    count_result = conn.query(count_sql, *params)
    total_count = count_result.first&.first || 0

    # Requested page of rows, newest first
    offset = (page - 1) * per_page

    data_sql = <<~SQL
      SELECT
        id, timestamp, ip_address, network_range_id, country, company,
        asn, asn_org, is_datacenter, is_vpn, is_proxy, is_bot,
        waf_action, request_method, response_status, rule_id,
        request_path, user_agent, tags
      FROM read_parquet([#{file_list}])
      #{where_clause}
      ORDER BY timestamp DESC
      LIMIT ? OFFSET ?
    SQL

    result = conn.query(data_sql, *params, per_page, offset)

    # Convert rows to event-like objects
    events = result.to_a.map { |row| row_to_event(row) }

    {
      total_count: total_count,
      events: events,
      page: page,
      per_page: per_page
    }
  end
rescue StandardError => e
  Rails.logger.error "[EventDdb] Error in Parquet search: #{e.message}"
  Rails.logger.error e.backtrace.join("\n")
  nil
end
|
||||
|
||||
# Fallback search path (kept for backward compatibility): runs against the `events` view set up by with_events_from_parquet instead of an explicit Parquet file list
|
||||
def search_duckdb(filters = {}, page: 1, per_page: 50)
|
||||
with_events_from_parquet do |conn|
|
||||
# Build WHERE clause
|
||||
where_clause, params = build_where_clause(filters)
|
||||
|
||||
@@ -685,7 +786,7 @@ class EventDdb
|
||||
}
|
||||
end
|
||||
rescue StandardError => e
|
||||
Rails.logger.error "[EventDdb] Error in search: #{e.message}"
|
||||
Rails.logger.error "[EventDdb] Error in DuckDB search: #{e.message}"
|
||||
Rails.logger.error e.backtrace.join("\n")
|
||||
nil
|
||||
end
|
||||
|
||||
@@ -15,6 +15,7 @@ class NetworkRange < ApplicationRecord
|
||||
|
||||
# Associations
|
||||
has_many :rules, dependent: :destroy
|
||||
has_many :events, foreign_key: :network_range_id, dependent: :nullify
|
||||
belongs_to :user, optional: true
|
||||
|
||||
# Validations
|
||||
@@ -36,8 +37,8 @@ class NetworkRange < ApplicationRecord
|
||||
scope :geolite_imported, -> { where(source: ['geolite_asn', 'geolite_country']) }
|
||||
scope :geolite_asn, -> { where(source: 'geolite_asn') }
|
||||
scope :geolite_country, -> { where(source: 'geolite_country') }
|
||||
scope :with_events, -> { where("events_count > 0") }
|
||||
scope :most_active, -> { order(events_count: :desc) }
|
||||
scope :with_events, -> { joins(:events).distinct }
|
||||
scope :most_active, -> { joins(:events).group('network_ranges.id').order('COUNT(events.id) DESC') }
|
||||
|
||||
# Callbacks
|
||||
before_validation :set_default_source
|
||||
@@ -241,7 +242,7 @@ class NetworkRange < ApplicationRecord
|
||||
def agent_tally
|
||||
Rails.cache.fetch("#{cache_key}:agent_tally", expires_in: 5.minutes) do
|
||||
# Use DuckDB for fast agent tally instead of loading all events into memory
|
||||
if persisted? && events_count > 0
|
||||
if persisted? && has_events?
|
||||
# Include child network ranges to capture all traffic within this network block
|
||||
network_ids = [id] + child_ranges.pluck(:id)
|
||||
|
||||
@@ -417,10 +418,16 @@ class NetworkRange < ApplicationRecord
|
||||
cidr.to_s.gsub('/', '_')
|
||||
end
|
||||
|
||||
# Analytics methods - events_count is now a counter cache column maintained by database triggers
|
||||
# This is much more performant than the previous implementation that did complex network queries
|
||||
def events_count
|
||||
self[:events_count] || 0
|
||||
# Reports whether any events exist for this network range.
# Unsaved records always answer false. Otherwise the DuckDB-backed count for
# this range and its child ranges is consulted first; when that count is nil
# or not positive, the answer comes from `events.exists?`.
def has_events?
  return false unless persisted?

  # This range's id followed by the ids of its child ranges, so traffic
  # anywhere inside the block is counted
  network_ids = [id, *child_ranges.pluck(:id)]

  duckdb_count = with_duckdb_fallback { EventDdb.network_event_count(network_ids) }
  return true if duckdb_count&.positive?

  events.exists?
end
|
||||
|
||||
def events
|
||||
|
||||
Reference in New Issue
Block a user