require 'csv'
require 'ipaddr'

# Imports GeoLite2 Country CSV data — either loose CSV files or a ZIP
# archive — into NetworkRange rows, resolving geoname IDs against the
# accompanying Locations CSVs and reporting progress via a DataImport.
class GeoliteCountryImporter
  BATCH_SIZE = 1000

  # Encodings attempted, in order, when a CSV payload is not valid UTF-8.
  CANDIDATE_ENCODINGS = ['UTF-8', 'ISO-8859-1', 'Windows-1252'].freeze

  # ZIP local-file-header magic bytes (PK\x03\x04), binary-encoded to match
  # data read with File#read in 'rb' mode.
  ZIP_MAGIC = "PK\x03\x04".b.freeze

  # @param file_path [String] path to a GeoLite Blocks CSV or a ZIP archive
  # @param data_import [#update_progress] progress tracker
  def initialize(file_path, data_import:)
    @file_path = file_path
    @data_import = data_import
    @total_records = 0
    @processed_records = 0
    @failed_records = 0
    @errors = []
    @locations_cache = {}
  end

  # Runs the import and returns a summary hash:
  #   { total_records:, processed_records:, failed_records:, errors: }
  def import
    Rails.logger.info "Starting import for file: #{@file_path}"
    Rails.logger.info "File exists: #{File.exist?(@file_path)}"
    Rails.logger.info "File size: #{File.size(@file_path)} bytes" if File.exist?(@file_path)

    # Detect ZIP by magic bytes rather than trusting the file extension.
    is_zip_file = check_if_zip_file
    Rails.logger.info "File is zip: #{is_zip_file}"

    if is_zip_file
      Rails.logger.info "Calling import_from_zip"
      import_from_zip
    else
      Rails.logger.info "Calling regular import (not zip)"
      load_locations_data
      import_csv_file(@file_path)
    end

    {
      total_records: @total_records,
      processed_records: @processed_records,
      failed_records: @failed_records,
      errors: @errors
    }
  end

  private

  # True when the file starts with the ZIP magic bytes; false on any read
  # error (missing file, permissions) or when the file is too short.
  def check_if_zip_file
    File.open(@file_path, 'rb') do |file|
      header = file.read(4)
      return header == ZIP_MAGIC
    end
  rescue => e
    Rails.logger.error "Error checking if file is zip: #{e.message}"
    false
  end

  # Loads locations then processes every *Blocks*.csv entry inside the
  # archive. The archive is buffered fully in memory so rubyzip works with
  # the content regardless of how the file was delivered.
  def import_from_zip
    require 'zip'
    require 'stringio'

    Rails.logger.info "Processing zip file directly: #{@file_path}"

    zip_content = File.binread(@file_path)

    Zip::File.open_buffer(StringIO.new(zip_content)) do |zip_file|
      Rails.logger.info "Files in zip:"
      zip_file.each do |entry|
        Rails.logger.info "  - #{entry.name} (#{entry.size} bytes)"
      end

      # Locations must be cached before blocks so import_record can
      # resolve geoname IDs to country attributes.
      load_locations_data_from_zip(zip_file)

      zip_file.each do |entry|
        next unless entry.name.include?('Blocks') && entry.name.end_with?('.csv')

        Rails.logger.info "Processing block file from zip: #{entry.name}"
        process_csv_from_zip(zip_file, entry)
      end
    end
  rescue => e
    Rails.logger.error "Error processing ZIP file: #{e.message}"
    Rails.logger.error e.backtrace.join("\n")
    raise
  end

  # Parses one Blocks CSV entry and imports each row, tallying successes
  # and failures. The payload's encoding is resolved *before* parsing (see
  # #normalize_to_utf8) so a mid-file encoding failure can never cause rows
  # to be imported and counted twice.
  def process_csv_from_zip(zip_file, entry)
    zip_file.get_input_stream(entry) do |io|
      content = normalize_to_utf8(io.read, entry.name)

      CSV.parse(content, headers: true, header_converters: :symbol) do |row|
        @total_records += 1
        begin
          import_record(row)
          @processed_records += 1
        rescue => e
          @failed_records += 1
          @errors << "Row #{@total_records}: #{e.message} - Data: #{row.to_h}"
        end
        update_progress_if_needed
      end

      Rails.logger.info "Successfully processed #{entry.name}"
    end
  end

  # Caches geoname_id => country attributes from every Locations CSV in the
  # archive. English files are sorted first and, because cache_location uses
  # ||=, the first file loaded wins — so English names take priority.
  # (Previously English was sorted first but later non-English files
  # overwrote it, inverting the intended priority.)
  def load_locations_data_from_zip(zip_file)
    location_entries = zip_file.select do |entry|
      entry.name.include?('Locations') && entry.name.end_with?('.csv')
    end
    location_entries.sort_by! { |entry| entry.name.include?('Locations-en') ? 0 : 1 }

    location_entries.each do |entry|
      Rails.logger.info "Loading locations from: #{entry.name}"

      zip_file.get_input_stream(entry) do |io|
        content = normalize_to_utf8(io.read, entry.name)

        CSV.parse(content, headers: true, header_converters: :symbol) do |row|
          cache_location(row)
        end

        Rails.logger.info "Loaded locations from #{entry.name}"
      end
    end

    Rails.logger.info "Loaded #{@locations_cache.size} location records"
  end

  # Non-zip path: stream each row of the Blocks CSV from disk.
  def import_csv_file(csv_file)
    CSV.foreach(csv_file, headers: true, header_converters: :symbol, encoding: 'UTF-8') do |row|
      @total_records += 1
      begin
        import_record(row)
        @processed_records += 1
      rescue => e
        @failed_records += 1
        @errors << "Row #{@total_records}: #{e.message} - Data: #{row.to_h}"
      end
      # Exactly one progress check per row (the old code ran it twice on
      # the failure path, producing duplicate progress updates).
      update_progress_if_needed
    end
  end

  # Upserts one Blocks row into NetworkRange. Raises on a missing or
  # malformed network CIDR; the caller counts the row as failed.
  def import_record(row)
    network = row[:network]
    geoname_id = row[:geoname_id]
    registered_country_geoname_id = row[:registered_country_geoname_id]
    is_anonymous_proxy = row[:is_anonymous_proxy] == '1'
    is_satellite_provider = row[:is_satellite_provider] == '1'
    is_anycast = row[:is_anycast] == '1'

    raise "Missing required field: network" unless network

    # Validate network format — raises IPAddr::InvalidAddressError if bad.
    IPAddr.new(network)

    # Get location data - prefer geoname_id, then registered_country_geoname_id
    location_data = @locations_cache[geoname_id] ||
                    @locations_cache[registered_country_geoname_id] ||
                    {}

    # Raw GeoLite country payload stored under network_data[:geolite].
    # compact is applied to the *inner* hash so absent attributes are
    # dropped; the previous outer compact never removed anything because
    # :country is never nil. Boolean false values survive compact.
    geolite_data = {
      country: {
        geoname_id: geoname_id,
        registered_country_geoname_id: registered_country_geoname_id,
        represented_country_geoname_id: row[:represented_country_geoname_id],
        continent_code: location_data[:continent_code],
        continent_name: location_data[:continent_name],
        country_name: location_data[:country_name],
        country_iso_code: location_data[:country_iso_code],
        is_in_european_union: location_data[:is_in_european_union],
        is_anonymous_proxy: is_anonymous_proxy,
        is_satellite_provider: is_satellite_provider,
        is_anycast: is_anycast
      }.compact
    }

    NetworkRange.upsert(
      {
        network: network,
        country: location_data[:country_iso_code],
        is_proxy: is_anonymous_proxy,
        source: 'geolite_country',
        network_data: { geolite: geolite_data },
        updated_at: Time.current
      },
      unique_by: :index_network_ranges_on_network_unique
    )
  end

  # Pushes progress to the DataImport every 100 handled rows.
  def update_progress_if_needed
    return unless (@processed_records + @failed_records) % 100 == 0

    @data_import.update_progress(
      processed: @processed_records,
      failed: @failed_records,
      total_records: @total_records,
      stats: {
        total_records: @total_records,
        current_file: File.basename(@file_path),
        locations_loaded: @locations_cache.size,
        recent_errors: @errors.last(5)
      }
    )
  end

  # Non-zip path: cache locations from CSV files next to @file_path.
  # As in the zip path, the first file loaded (English-first) wins.
  def load_locations_data
    find_locations_files.each do |locations_file|
      CSV.foreach(locations_file, headers: true, header_converters: :symbol, encoding: 'UTF-8') do |row|
        cache_location(row)
      end
    end
  end

  # Stores one Locations row in the cache unless the geoname_id is already
  # present, so files loaded earlier take priority.
  def cache_location(row)
    geoname_id = row[:geoname_id]
    return unless geoname_id

    @locations_cache[geoname_id] ||= {
      country_iso_code: row[:country_iso_code],
      country_name: row[:country_name],
      continent_code: row[:continent_code],
      continent_name: row[:continent_name],
      is_in_european_union: row[:is_in_european_union]
    }
  end

  # Returns the raw bytes re-interpreted as the first candidate encoding
  # that is valid, transcoded to UTF-8. Raises when nothing matches.
  #
  # NOTE: passing `encoding:` to CSV.parse does not re-interpret an
  # in-memory string, and retrying a parse after rows had been imported
  # double-counted records — hence the up-front detection here.
  def normalize_to_utf8(content, label)
    CANDIDATE_ENCODINGS.each do |encoding|
      candidate = content.dup.force_encoding(encoding)
      next unless candidate.valid_encoding?

      begin
        return candidate.encode('UTF-8')
      rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
        next
      end
    end
    raise "Failed to process #{label} with any supported encoding"
  end

  # Locates Locations CSVs on disk next to the import file, preferring the
  # English file. uniq guards against the glob matching the English file a
  # second time.
  def find_locations_files
    base_dir = File.dirname(@file_path)
    if @file_path.end_with?('.zip')
      base_name = File.basename(@file_path, '.zip')
      # Look for English locations file first, then any locations file
      [
        File.join(base_dir, "#{base_name}-Locations-en.csv"),
        Dir[File.join(base_dir, "#{base_name}-Locations-*.csv")].first
      ].compact.uniq.select { |file| File.exist?(file) }
    else
      Dir[File.join(base_dir, "*Locations*.csv")].select { |file| File.exist?(file) }
    end
  end
end