Fix some blocked/allow laggards after migrating. Add DuckDB for outstanding analytics performance. Start adding an importer for all bot networks

This commit is contained in:
Dan Milne
2025-11-18 16:40:05 +11:00
parent ef56779584
commit 3f274c842c
37 changed files with 3522 additions and 151 deletions

View File

@@ -0,0 +1,52 @@
# frozen_string_literal: true

# CleanupOldEventsJob - Removes events older than the configured retention period
#
# Runs periodically (hourly) and deletes events whose timestamp falls before
# the cutoff derived from the 'event_retention_days' setting (default: 90).
# Keeping the events table pruned bounds database size and keeps queries fast.
#
# A retention value of 0 or below disables cleanup entirely, letting
# administrators opt out of pruning when full history is required.
#
# Schedule: Every hour (configured in config/recurring.yml)
class CleanupOldEventsJob < ApplicationJob
  queue_as :background

  # Returns the number of events deleted (0 when disabled or nothing to do).
  def perform
    retention_days = Setting.event_retention_days

    # Zero or negative retention means pruning is disabled.
    if retention_days <= 0
      Rails.logger.info "CleanupOldEventsJob: Event retention disabled (retention_days: #{retention_days})"
      return 0
    end

    cutoff = retention_days.days.ago
    stale_events = Event.where('timestamp < ?', cutoff)
    total = stale_events.count

    if total.zero?
      Rails.logger.info "CleanupOldEventsJob: No events older than #{retention_days} days found"
      return 0
    end

    Rails.logger.info "CleanupOldEventsJob: Deleting #{total} events older than #{retention_days} days (before #{cutoff})"

    # Work through the rows in fixed-size batches so no single transaction
    # holds locks for long on a large table.
    removed = 0
    stale_events.in_batches(of: 10_000) do |chunk|
      chunk_removed = chunk.delete_all
      removed += chunk_removed
      Rails.logger.info "CleanupOldEventsJob: Deleted batch of #{chunk_removed} events (total: #{removed}/#{total})"
    end

    Rails.logger.info "CleanupOldEventsJob: Successfully deleted #{removed} events"
    removed
  end
end

View File

@@ -15,31 +15,14 @@ class FetchIpapiDataJob < ApplicationJob
ipapi_data = Ipapi.lookup(sample_ip)
if ipapi_data.present? && !ipapi_data.key?('error')
# Check if IPAPI returned a different route than our tracking network
ipapi_route = ipapi_data.dig('asn', 'route')
target_network = tracking_network
# Process IPAPI data and create network ranges
result = Ipapi.process_ipapi_data(ipapi_data, tracking_network)
if ipapi_route.present? && ipapi_route != tracking_network.cidr
# IPAPI returned a different CIDR - find or create that network range
Rails.logger.info "IPAPI returned different route: #{ipapi_route} (requested: #{tracking_network.cidr})"
# Mark the tracking network as having been queried
# Use the broadest CIDR returned for deduplication
tracking_network.mark_ipapi_queried!(result[:broadest_cidr])
target_network = NetworkRange.find_or_create_by(network: ipapi_route) do |nr|
nr.source = 'api_imported'
nr.creation_reason = "Created from IPAPI lookup for #{tracking_network.cidr}"
end
Rails.logger.info "Storing IPAPI data on correct network: #{target_network.cidr}"
end
# Store data on the target network (wherever IPAPI said it belongs)
target_network.set_network_data(:ipapi, ipapi_data)
target_network.last_api_fetch = Time.current
target_network.save!
# Mark the tracking network as having been queried, with the CIDR that was returned
tracking_network.mark_ipapi_queried!(target_network.cidr)
Rails.logger.info "Successfully fetched IPAPI data for #{tracking_network.cidr} (stored on #{target_network.cidr})"
Rails.logger.info "Successfully fetched IPAPI data for #{tracking_network.cidr} (created #{result[:networks].length} networks)"
# Broadcast to the tracking network
broadcast_ipapi_update(tracking_network, ipapi_data)

View File

@@ -0,0 +1,26 @@
# frozen_string_literal: true

# ImportAllBotNetworkRangesJob - Background job for importing from all bot sources
#
# Delegates to BotNetworkRangeImporter.import_all_sources and broadcasts a
# completion summary to the "bot_imports" channel. On failure it broadcasts an
# error payload and re-raises, mirroring the per-source
# ImportBotNetworkRangesJob so clients are never left waiting silently.
class ImportAllBotNetworkRangesJob < ApplicationJob
  queue_as :default

  # @param options [Hash] passed through unchanged to the importer
  def perform(options = {})
    Rails.logger.info "Starting batch import of all bot network ranges"

    results = BotNetworkRangeImporter.import_all_sources(options)

    # Send completion summary
    Rails.logger.info "Batch import completed. Summary: #{results}"

    # Broadcast summary to clients
    ActionCable.server.broadcast(
      "bot_imports",
      {
        type: 'batch_summary',
        status: 'completed',
        results: results,
        message: "Batch import completed for all sources"
      }
    )
  rescue => e
    # Consistent with ImportBotNetworkRangesJob: surface the failure to
    # subscribed clients instead of dying without notice.
    Rails.logger.error "Batch bot network range import failed: #{e.message}"

    ActionCable.server.broadcast(
      "bot_imports",
      {
        type: 'batch_summary',
        status: 'error',
        error: e.message,
        message: "Batch import failed: #{e.message}"
      }
    )

    # Re-raise so the queue backend records the job as failed.
    raise
  end
end

View File

@@ -0,0 +1,47 @@
# frozen_string_literal: true

# ImportBotNetworkRangesJob - Background job for importing bot network ranges
#
# Imports network ranges from official bot provider sources.
# Runs asynchronously to avoid blocking the web interface.
#
# Broadcasts a completion (or error) payload on the "bot_imports" channel so
# the UI can react in real time. Importer errors are re-raised after being
# broadcast so the queue backend records the job as failed.
class ImportBotNetworkRangesJob < ApplicationJob
  queue_as :default

  # @param source_key [String, Symbol] source identifier understood by BotNetworkRangeImporter
  # @param options [Hash] passed through unchanged to the importer
  def perform(source_key, options = {})
    Rails.logger.info "Starting bot network range import for source: #{source_key}"

    result = BotNetworkRangeImporter.import_from_source(source_key, options)

    # Send notification or log completion
    Rails.logger.info "Successfully imported #{result[:imported]} ranges from #{result[:source]}"

    # Optionally broadcast via Turbo Streams for real-time updates
    ActionCable.server.broadcast(
      "bot_imports",
      {
        source: source_key,
        status: 'completed',
        imported: result[:imported],
        message: "Successfully imported #{result[:imported]} ranges from #{result[:source]}"
      }
    )
  rescue => e
    Rails.logger.error "Bot network range import failed for #{source_key}: #{e.message}"

    # Broadcast error notification
    ActionCable.server.broadcast(
      "bot_imports",
      {
        source: source_key,
        status: 'error',
        error: e.message,
        message: "Failed to import from #{source_key}: #{e.message}"
      }
    )

    # Bare `raise` is the idiomatic way to re-raise the in-flight exception;
    # behavior matches the previous `raise e` but avoids the redundant arg.
    raise
  end
end

View File

@@ -53,16 +53,15 @@ class ProcessWafEventJob < ApplicationJob
# Queue IPAPI enrichment based on /24 tracking
# The tracking network is the /24 that stores ipapi_queried_at
if NetworkRange.should_fetch_ipapi_for_ip?(event.ip_address)
# Use tracking network for fetch status to avoid race conditions
if tracking_network.is_fetching_api_data?(:ipapi)
Rails.logger.info "Skipping IPAPI fetch for #{tracking_network.cidr} - already being fetched"
else
tracking_network.mark_as_fetching_api_data!(:ipapi)
# Atomically mark as fetching - this prevents duplicate jobs via database lock
if tracking_network.mark_as_fetching_api_data!(:ipapi)
Rails.logger.info "Queueing IPAPI fetch for IP #{event.ip_address} (tracking network: #{tracking_network.cidr})"
FetchIpapiDataJob.perform_later(network_range_id: tracking_network.id)
else
Rails.logger.info "Skipping IPAPI fetch for #{tracking_network.cidr} - another job already started"
end
else
Rails.logger.debug "Skipping IPAPI fetch for IP #{event.ip_address} - already queried recently"
Rails.logger.debug "Skipping IPAPI fetch for IP #{event.ip_address} - already queried or being fetched"
end
# Evaluate WAF policies inline if needed (lazy evaluation)

View File

@@ -0,0 +1,89 @@
# frozen_string_literal: true

# Background job to sync events from PostgreSQL to DuckDB
# Runs every 5 minutes to keep analytics database up-to-date
# Uses watermark tracking to only sync new events
class SyncEventsToDuckdbJob < ApplicationJob
  queue_as :default

  # Key for storing last sync timestamp in Rails cache
  WATERMARK_CACHE_KEY = "duckdb_last_sync_time"
  WATERMARK_TTL = 1.week

  # Overlap window to catch late-arriving events
  SYNC_OVERLAP = 1.minute

  def perform
    service = AnalyticsDuckdbService.instance

    # Pick the starting point, then hand off to the service for the actual
    # copy (PostgreSQL cursor + DuckDB Appender; schema setup happens inside
    # sync_new_events).
    start_at = determine_sync_start_time(service)
    Rails.logger.info "[DuckDB Sync] Starting sync from #{start_at}"

    synced = service.sync_new_events(start_at)

    # Only advance the watermark when something was actually copied.
    if synced.positive?
      update_last_sync_time
      Rails.logger.info "[DuckDB Sync] Successfully synced #{synced} events"
    else
      Rails.logger.info "[DuckDB Sync] No new events to sync"
    end
  rescue StandardError => e
    Rails.logger.error "[DuckDB Sync] Job failed: #{e.message}"
    Rails.logger.error e.backtrace.join("\n")
    raise # Re-raise to mark job as failed in Solid Queue
  end

  private

  # Decide where syncing should begin:
  # 1. DuckDB empty (first run): start from the oldest PostgreSQL event.
  # 2. Otherwise: resume from the cached watermark minus SYNC_OVERLAP, falling
  #    back to the newest DuckDB event when the cache entry is gone.
  def determine_sync_start_time(service)
    earliest_in_duckdb = service.oldest_event_timestamp
    return first_sync_start_time if earliest_in_duckdb.nil?

    watermark = Rails.cache.read(WATERMARK_CACHE_KEY)
    if watermark
      # Normal case: rewind by the overlap so late-arriving rows are re-read.
      Rails.logger.debug "[DuckDB Sync] Using watermark: #{watermark} (with #{SYNC_OVERLAP}s overlap)"
      watermark - SYNC_OVERLAP
    else
      # Cache expired or the process restarted - derive a restart point from
      # what DuckDB already holds.
      latest_in_duckdb = service.newest_event_timestamp
      fallback = latest_in_duckdb ? latest_in_duckdb - SYNC_OVERLAP : earliest_in_duckdb
      Rails.logger.info "[DuckDB Sync] Watermark not found, using newest DuckDB event: #{fallback}"
      fallback
    end
  end

  # Starting point for the very first sync, when DuckDB has no rows yet.
  def first_sync_start_time
    earliest_in_pg = Event.minimum(:timestamp)
    if earliest_in_pg
      Rails.logger.info "[DuckDB Sync] First sync - starting from oldest event: #{earliest_in_pg}"
      earliest_in_pg
    else
      # No events in PostgreSQL at all - fall back to a recent window.
      Rails.logger.warn "[DuckDB Sync] No events found in PostgreSQL"
      1.day.ago
    end
  end

  # Persist the watermark so the next run can resume incrementally.
  def update_last_sync_time
    now = Time.current
    Rails.cache.write(WATERMARK_CACHE_KEY, now, expires_in: WATERMARK_TTL)
    Rails.logger.debug "[DuckDB Sync] Updated watermark to #{now}"
  end
end