Add 'tags' to event model. Add a dataimport system - currently for MaxMind zip files
This commit is contained in:
288
app/services/geolite_country_importer.rb
Normal file
288
app/services/geolite_country_importer.rb
Normal file
@@ -0,0 +1,288 @@
|
||||
require 'csv'
require 'ipaddr'
|
||||
|
||||
class GeoliteCountryImporter
  # Imports MaxMind GeoLite2 Country data into NetworkRange records.
  #
  # Accepts either a loose Blocks CSV file or a MaxMind zip bundle (one or
  # more "Blocks" CSVs plus per-language "Locations" CSVs). Progress is
  # reported through the injected DataImport record and a summary hash is
  # returned from #import.

  BATCH_SIZE = 1000

  # Encodings tried, in order, when a CSV payload is not valid UTF-8.
  FALLBACK_ENCODINGS = ['UTF-8', 'ISO-8859-1', 'Windows-1252'].freeze

  # @param file_path [String] path to a GeoLite2 CSV file or zip archive
  # @param data_import [#update_progress] record used for progress reporting
  def initialize(file_path, data_import:)
    @file_path = file_path
    @data_import = data_import
    @total_records = 0
    @processed_records = 0
    @failed_records = 0
    @errors = []
    @locations_cache = {}
  end

  # Runs the import.
  #
  # @return [Hash] :total_records, :processed_records, :failed_records, :errors
  def import
    Rails.logger.info "Starting import for file: #{@file_path}"
    Rails.logger.info "File exists: #{File.exist?(@file_path)}"
    Rails.logger.info "File size: #{File.size(@file_path)} bytes" if File.exist?(@file_path)

    # Detect zip by magic bytes rather than trusting the file extension.
    is_zip_file = check_if_zip_file
    Rails.logger.info "File is zip: #{is_zip_file}"

    if is_zip_file
      Rails.logger.info "Calling import_from_zip"
      import_from_zip
    else
      Rails.logger.info "Calling regular import (not zip)"
      load_locations_data
      import_csv_file(@file_path)
    end

    {
      total_records: @total_records,
      processed_records: @processed_records,
      failed_records: @failed_records,
      errors: @errors
    }
  end

  private

  # True when the file starts with the ZIP magic bytes (PK\x03\x04).
  # Returns false (rather than raising) when the file cannot be read.
  def check_if_zip_file
    File.open(@file_path, 'rb') do |file|
      header = file.read(4)
      return header == "PK\x03\x04"
    end
  rescue => e
    Rails.logger.error "Error checking if file is zip: #{e.message}"
    false
  end

  # Processes a MaxMind zip bundle entirely in memory: caches the Locations
  # CSVs first, then imports every Blocks CSV.
  def import_from_zip
    require 'zip'
    require 'stringio'

    Rails.logger.info "Processing zip file directly: #{@file_path}"

    # Read the whole archive into memory so rubyzip can seek freely.
    zip_content = File.binread(@file_path)

    Zip::File.open_buffer(StringIO.new(zip_content)) do |zip_file|
      # First, see what's in the zip
      Rails.logger.info "Files in zip:"
      zip_file.each do |entry|
        Rails.logger.info " - #{entry.name} (#{entry.size} bytes)"
      end

      # Locations must be cached before Blocks rows reference them by geoname_id.
      load_locations_data_from_zip(zip_file)

      zip_file.each do |entry|
        if entry.name.include?('Blocks') && entry.name.end_with?('.csv')
          Rails.logger.info "Processing block file from zip: #{entry.name}"
          process_csv_from_zip(zip_file, entry)
        end
      end
    end
  rescue => e
    Rails.logger.error "Error processing ZIP file: #{e.message}"
    Rails.logger.error e.backtrace.join("\n")
    raise
  end

  # Imports one Blocks CSV entry from the open zip archive, counting each
  # row as processed or failed.
  def process_csv_from_zip(zip_file, entry)
    zip_file.get_input_stream(entry) do |io|
      content = io.read

      encoding_used = parse_with_encoding_fallback(content, entry.name) do |row|
        @total_records += 1

        begin
          import_record(row)
          @processed_records += 1
        rescue => e
          @failed_records += 1
          @errors << "Row #{@total_records}: #{e.message} - Data: #{row.to_h}"
        end

        update_progress_if_needed
      end

      unless encoding_used
        @errors << "Failed to process #{entry.name} with any supported encoding"
      end
    end
  end

  # Yields each CSV row of +content+, retrying with FALLBACK_ENCODINGS in
  # order until one works. Returns the successful encoding name, or nil when
  # none applies.
  #
  # CSV.parse does not re-decode an in-memory String via its :encoding
  # option, so the bytes are re-tagged with String#force_encoding and
  # validated up front. Parsing the full table before yielding also prevents
  # a mid-file failure from double-counting rows on an encoding retry.
  def parse_with_encoding_fallback(content, source_name)
    FALLBACK_ENCODINGS.each do |encoding|
      candidate = content.dup.force_encoding(encoding)
      next unless candidate.valid_encoding?

      begin
        CSV.parse(candidate, headers: true, header_converters: :symbol).each do |row|
          yield row
        end
        Rails.logger.info "Successfully processed #{source_name} with #{encoding} encoding"
        return encoding
      rescue CSV::MalformedCSVError => e
        Rails.logger.warn "Failed to process #{source_name} with #{encoding} encoding: #{e.message}"
      end
    end

    nil
  end

  # Caches country metadata keyed by geoname_id from every Locations CSV in
  # the archive. English files are loaded first and cache_location keeps the
  # first value per key, so English names win over other languages.
  def load_locations_data_from_zip(zip_file)
    # Find all location files and prioritize English
    location_entries = zip_file.select { |entry| entry.name.include?('Locations') && entry.name.end_with?('.csv') }
    location_entries.sort_by! { |entry| entry.name.include?('Locations-en') ? 0 : 1 }

    location_entries.each do |entry|
      Rails.logger.info "Loading locations from: #{entry.name}"
      zip_file.get_input_stream(entry) do |io|
        content = io.read
        parse_with_encoding_fallback(content, entry.name) do |row|
          cache_location(row)
        end
      end
    end

    Rails.logger.info "Loaded #{@locations_cache.size} location records"
  end

  # Adds one Locations row to @locations_cache unless its geoname_id is
  # missing or already cached (first writer wins — see the English-first
  # ordering in the callers).
  def cache_location(row)
    geoname_id = row[:geoname_id]
    return unless geoname_id

    @locations_cache[geoname_id] ||= {
      country_iso_code: row[:country_iso_code],
      country_name: row[:country_name],
      continent_code: row[:continent_code],
      continent_name: row[:continent_name],
      is_in_european_union: row[:is_in_european_union]
    }
  end

  # Imports a plain (non-zip) Blocks CSV from disk.
  def import_csv_file(csv_file)
    CSV.foreach(csv_file, headers: true, header_converters: :symbol, encoding: 'UTF-8') do |row|
      @total_records += 1

      begin
        import_record(row)
        @processed_records += 1
      rescue => e
        @failed_records += 1
        @errors << "Row #{@total_records}: #{e.message} - Data: #{row.to_h}"
      end

      # Throttling (every 100 rows) happens inside update_progress_if_needed.
      update_progress_if_needed
    end
  end

  # Upserts a single Blocks row into NetworkRange.
  #
  # Raises when the network field is missing or not a valid CIDR; callers
  # count the failure and continue with the next row.
  def import_record(row)
    network = row[:network]
    geoname_id = row[:geoname_id]
    registered_country_geoname_id = row[:registered_country_geoname_id]
    is_anonymous_proxy = row[:is_anonymous_proxy] == '1'
    is_satellite_provider = row[:is_satellite_provider] == '1'
    is_anycast = row[:is_anycast] == '1'

    raise "Missing required field: network" unless network

    # Validate network format — raises IPAddr::InvalidAddressError if invalid.
    IPAddr.new(network)

    # Get location data - prefer geoname_id, then registered_country_geoname_id
    location_data = @locations_cache[geoname_id] || @locations_cache[registered_country_geoname_id] || {}

    additional_data = {
      geoname_id: geoname_id,
      registered_country_geoname_id: registered_country_geoname_id,
      represented_country_geoname_id: row[:represented_country_geoname_id],
      continent_code: location_data[:continent_code],
      continent_name: location_data[:continent_name],
      country_name: location_data[:country_name],
      is_in_european_union: location_data[:is_in_european_union],
      is_satellite_provider: is_satellite_provider,
      is_anycast: is_anycast
    }.compact

    NetworkRange.upsert(
      {
        network: network,
        country: location_data[:country_iso_code],
        is_proxy: is_anonymous_proxy,
        source: 'geolite_country',
        additional_data: additional_data,
        updated_at: Time.current
      },
      unique_by: :index_network_ranges_on_network_unique
    )
  end

  # Pushes progress to the DataImport every 100 attempted (processed or
  # failed) rows.
  def update_progress_if_needed
    return unless (@processed_records + @failed_records) % 100 == 0

    @data_import.update_progress(
      processed: @processed_records,
      failed: @failed_records,
      total_records: @total_records,
      stats: {
        total_records: @total_records,
        current_file: File.basename(@file_path),
        locations_loaded: @locations_cache.size,
        recent_errors: @errors.last(5)
      }
    )
  end

  # Loads location metadata from CSV files sitting next to a non-zip input.
  # Files are ordered English-first by find_locations_files; cache_location
  # keeps the first entry per geoname_id so English names take priority.
  def load_locations_data
    find_locations_files.each do |locations_file|
      CSV.foreach(locations_file, headers: true, header_converters: :symbol, encoding: 'UTF-8') do |row|
        cache_location(row)
      end
    end
  end

  # Finds candidate Locations CSVs next to the input file, English first.
  # .uniq guards against the English file matching both patterns.
  def find_locations_files
    base_dir = File.dirname(@file_path)

    if @file_path.end_with?('.zip')
      base_name = File.basename(@file_path, '.zip')

      # Look for English locations file first, then any locations file
      [
        File.join(base_dir, "#{base_name}-Locations-en.csv"),
        Dir[File.join(base_dir, "#{base_name}-Locations-*.csv")].first
      ].compact.uniq.select { |file| File.exist?(file) }
    else
      Dir[File.join(base_dir, "*Locations*.csv")].select { |file| File.exist?(file) }
    end
  end
end
|
||||
Reference in New Issue
Block a user