#!/usr/bin/env ruby
# frozen_string_literal: true

# One-time backfill script to populate new columns in existing DuckDB events.
# This uses DuckDB's JOIN-based UPDATE for maximum performance.
#
# Strategy:
#   1. Find the ID range of DuckDB events whose new columns are still NULL.
#   2. Export (id, request_method, response_status, rule_id) from PostgreSQL
#      to a temporary CSV in bounded batches.
#   3. COPY the CSV into a temporary DuckDB staging table.
#   4. Run a single set-based UPDATE ... FROM join, then clean up.
#
# NOTE(review): assumes AnalyticsDuckdbService#with_connection yields a DuckDB
# connection exposing #query (returns rows as arrays) and #execute — confirm
# against the service implementation.

require 'csv'
require 'tempfile'

puts "DuckDB Column Backfill Script (JOIN-based UPDATE)"
puts "=" * 60
puts "This will update existing DuckDB events with data from PostgreSQL"
puts "using a fast JOIN-based approach"
puts

# Number of PostgreSQL rows exported per batch; bounds memory usage of each
# export round-trip.
BATCH_SIZE = 50_000

AnalyticsDuckdbService.instance.with_connection do |conn|
  # Step 1: count how much work there is, and bail out early if none.
  puts "Step 1: Counting events to backfill..."
  result = conn.query("SELECT COUNT(*) FROM events WHERE request_method IS NULL")
  total_to_backfill = result.first&.first || 0

  result = conn.query("SELECT COUNT(*) FROM events")
  total_events = result.first&.first || 0

  puts "  Total events in DuckDB: #{total_events}"
  puts "  Events needing backfill: #{total_to_backfill}"

  if total_to_backfill == 0
    puts "\n✓ All events already have new columns populated!"
    exit 0
  end

  # Get min and max event IDs in DuckDB that still need the backfill.
  # (total_to_backfill > 0 here, so min_id/max_id are non-NULL.)
  result = conn.query("SELECT MIN(id), MAX(id) FROM events WHERE request_method IS NULL")
  min_id, max_id = result.first
  puts "  ID range to backfill: #{min_id} to #{max_id}"

  puts "\nStep 2: Exporting PostgreSQL data in batches..."

  current_id = min_id
  batch_num = 0

  # Temporary CSV file used as the transfer medium between PG and DuckDB.
  temp_csv = Tempfile.new(['events_backfill', '.csv'])

  begin
    CSV.open(temp_csv.path, 'w') do |csv|
      # Header row must match the staging table's column order below.
      csv << ['id', 'request_method', 'response_status', 'rule_id']

      while current_id <= max_id
        batch_num += 1
        batch_end_id = [current_id + BATCH_SIZE - 1, max_id].min

        print "  Batch #{batch_num}: Exporting IDs #{current_id}-#{batch_end_id}..."

        # Fetch from PostgreSQL. A Range condition is equivalent to the raw
        # "id >= ? AND id <= ?" SQL fragment, without hand-built SQL.
        pg_events = Event.where(id: current_id..batch_end_id)
                         .select(:id, :request_method, :response_status, :rule_id)

        count = 0
        pg_events.find_each do |event|
          csv << [
            event.id,
            event.request_method,
            event.response_status,
            event.rule_id
          ]
          count += 1
        end

        puts " #{count} events"
        current_id = batch_end_id + 1
      end
    end

    # Close our handle so the size reported below reflects the flushed file.
    temp_csv.close

    puts "\n✓ Exported to temporary CSV: #{temp_csv.path}"
    puts "  File size: #{(File.size(temp_csv.path) / 1024.0 / 1024.0).round(2)} MB"

    puts "\nStep 3: Loading CSV into temporary DuckDB table..."

    # Drop any leftovers from a previous failed run, then stage the CSV.
    conn.execute("DROP TABLE IF EXISTS events_updates")
    conn.execute(<<~SQL)
      CREATE TABLE events_updates (
        id BIGINT,
        request_method INTEGER,
        response_status INTEGER,
        rule_id BIGINT
      )
    SQL

    # Tempfile paths are generated by Ruby (no user input), so interpolating
    # the path into the COPY statement is safe here.
    conn.execute(<<~SQL)
      COPY events_updates FROM '#{temp_csv.path}' (FORMAT CSV, HEADER TRUE, NULL '')
    SQL

    result = conn.query("SELECT COUNT(*) FROM events_updates")
    loaded_count = result.first&.first || 0
    puts "✓ Loaded #{loaded_count} rows into temporary table"

    puts "\nStep 4: Performing bulk UPDATE via JOIN..."
    start_time = Time.current

    # DuckDB's efficient set-based UPDATE...FROM syntax: one join instead of
    # millions of per-row UPDATE statements.
    conn.execute(<<~SQL)
      UPDATE events
      SET request_method = events_updates.request_method,
          response_status = events_updates.response_status,
          rule_id = events_updates.rule_id
      FROM events_updates
      WHERE events.id = events_updates.id
    SQL

    duration = Time.current - start_time
    puts "✓ Bulk update complete in #{duration.round(2)}s!"

    puts "\nStep 5: Cleaning up temporary table..."
    conn.execute("DROP TABLE events_updates")
    puts "✓ Temporary table dropped"
  ensure
    # close! both closes the handle and unlinks the file, so the temp CSV is
    # removed even when an exception fired before the explicit close above.
    temp_csv.close! if temp_csv
    # Don't leave the staging table behind if COPY or UPDATE failed; this is
    # a no-op on the success path (table already dropped in Step 5).
    conn.execute("DROP TABLE IF EXISTS events_updates")
  end

  puts "\nStep 6: Verifying backfill..."
  result = conn.query("SELECT COUNT(*) FROM events WHERE request_method IS NOT NULL OR response_status IS NOT NULL OR rule_id IS NOT NULL")
  filled_count = result.first&.first || 0

  result = conn.query("SELECT COUNT(*) FROM events WHERE request_method IS NULL AND response_status IS NULL AND rule_id IS NULL")
  still_null_count = result.first&.first || 0

  puts "  Events with new columns populated: #{filled_count}"
  puts "  Events still with NULL columns: #{still_null_count}"

  if still_null_count > 0
    puts "\n⚠ Note: #{still_null_count} events still have NULL values."
    puts "  This is normal if those events don't exist in PostgreSQL anymore"
    puts "  (they may have been cleaned up due to retention policy)"
  else
    puts "\n✓ Backfill complete! All events have new columns populated."
  end
end

puts "\n" + "=" * 60
puts "Backfill complete!"
puts "\nNext steps:"
puts "1. Test the events index page to verify everything works"
puts "2. Monitor performance improvements from DuckDB queries"