From 251cac39af71b66428d2c5ca5623e299689c8e55 Mon Sep 17 00:00:00 2001 From: Gerhard Schlager Date: Mon, 7 Apr 2025 17:06:20 +0200 Subject: [PATCH] DEV: Adds a basic importer for the IntermediateDB * It only imports users and emails so far * It stores mapped IDs and usernames in a SQLite DB. In the future, we might want to copy those into the Discourse DB at the end of a migration. * The importer is split into steps which can mostly be configured with a simple DSL * Data that needs to be shared between steps can be stored in an instance of the `SharedData` class * Steps are automatically sorted via their defined dependencies before they are executed * Common logic for finding unique names (username, group name) is extracted into a helper class * If possible, steps try to avoid loading already imported data (via `mapping.ids` table) * And steps should select the `discourse_id` instead of the `original_id` of mapped IDs via SQL --- migrations/config/importer.yml | 2 + migrations/config/locales/migrations.en.yml | 5 + .../db/mappings_db_schema/001-mappings.sql | 7 + .../db/mappings_db_schema/002-usernames.sql | 9 + migrations/lib/cli/import_command.rb | 5 +- migrations/lib/database.rb | 1 + migrations/lib/importer.rb | 13 ++ migrations/lib/importer/.gitkeep | 0 migrations/lib/importer/copy_step.rb | 166 ++++++++++++++++++ migrations/lib/importer/discourse_db.rb | 115 ++++++++++++ migrations/lib/importer/executor.rb | 63 +++++++ migrations/lib/importer/mapping_type.rb | 7 + migrations/lib/importer/shared_data.rb | 51 ++++++ migrations/lib/importer/step.rb | 116 ++++++++++++ migrations/lib/importer/step_stats.rb | 12 ++ migrations/lib/importer/steps/.gitkeep | 0 migrations/lib/importer/steps/user_emails.rb | 39 ++++ migrations/lib/importer/steps/users.rb | 3 +- migrations/lib/importer/topological_sorter.rb | 50 ++++++ migrations/lib/importer/unique_name_finder.rb | 84 +++++++++ 20 files changed, 744 insertions(+), 4 deletions(-) create mode 100644 migrations/config/importer.yml create mode 100644 migrations/db/mappings_db_schema/001-mappings.sql create mode 100644 migrations/db/mappings_db_schema/002-usernames.sql create mode 100644 migrations/lib/importer.rb delete mode 100644 migrations/lib/importer/.gitkeep create mode 100644 migrations/lib/importer/copy_step.rb create mode 100644 migrations/lib/importer/discourse_db.rb create mode 100644 migrations/lib/importer/executor.rb create mode 100644 migrations/lib/importer/mapping_type.rb create mode 100644 migrations/lib/importer/shared_data.rb create mode 100644 migrations/lib/importer/step.rb create mode 100644 migrations/lib/importer/step_stats.rb delete mode 100644 migrations/lib/importer/steps/.gitkeep create mode 100644 migrations/lib/importer/steps/user_emails.rb create mode 100644 migrations/lib/importer/topological_sorter.rb create mode 100644 migrations/lib/importer/unique_name_finder.rb diff --git a/migrations/config/importer.yml b/migrations/config/importer.yml new file mode 100644 index 00000000000..c085cda8acf --- /dev/null +++ b/migrations/config/importer.yml @@ -0,0 +1,2 @@ +intermediate_db: /shared/import/intermediate.db +mappings_db: /shared/import/mappings.db diff --git a/migrations/config/locales/migrations.en.yml b/migrations/config/locales/migrations.en.yml index fa194f82c2b..f2c1f2275ae 100644 --- a/migrations/config/locales/migrations.en.yml +++ b/migrations/config/locales/migrations.en.yml @@ -19,6 +19,11 @@ en: default_step_title: "Converting %{type}" max_progress_calculation: "Calculating items took %{duration}" + importer: + default_step_title: "Importing %{type}" + loading_required_data: "Loading required data..." + done: "Done. Total runtime: %{runtime}" + schema: validator: include_exclude_not_allowed: "Cannot use `include` and `exclude` together at %{path}" diff --git a/migrations/db/mappings_db_schema/001-mappings.sql b/migrations/db/mappings_db_schema/001-mappings.sql new file mode 100644 index 00000000000..ca22f76493d --- /dev/null +++ b/migrations/db/mappings_db_schema/001-mappings.sql @@ -0,0 +1,7 @@ +CREATE TABLE ids +( + original_id NUMERIC NOT NULL, + type INTEGER NOT NULL, + discourse_id NUMERIC NOT NULL, + PRIMARY KEY (original_id, type) +); diff --git a/migrations/db/mappings_db_schema/002-usernames.sql b/migrations/db/mappings_db_schema/002-usernames.sql new file mode 100644 index 00000000000..88b5f3e7ca8 --- /dev/null +++ b/migrations/db/mappings_db_schema/002-usernames.sql @@ -0,0 +1,9 @@ +CREATE TABLE usernames +( + discourse_user_id NUMERIC NOT NULL, + original_username TEXT NOT NULL, + discourse_username TEXT NOT NULL, + PRIMARY KEY (discourse_user_id) +); + +CREATE INDEX usernames_original_username ON usernames (original_username); diff --git a/migrations/lib/cli/import_command.rb b/migrations/lib/cli/import_command.rb index 5365bffc276..44465aa784b 100644 --- a/migrations/lib/cli/import_command.rb +++ b/migrations/lib/cli/import_command.rb @@ -9,10 +9,9 @@ module Migrations::CLI end def execute - ::Migrations.load_rails_environment + ::Migrations.load_rails_environment(quiet: true) - puts "Importing into Discourse #{Discourse::VERSION::STRING}" - puts "Extralite SQLite version: #{Extralite.sqlite3_version}" + ::Migrations::Importer.execute end end end diff --git a/migrations/lib/database.rb b/migrations/lib/database.rb index fac8428eeed..2b5946e34f6 100644 --- a/migrations/lib/database.rb +++ b/migrations/lib/database.rb @@ -8,6 +8,7 @@ require "oj" module Migrations module Database INTERMEDIATE_DB_SCHEMA_PATH = File.join(::Migrations.root_path, "db", "intermediate_db_schema") + MAPPINGS_DB_SCHEMA_PATH = File.join(::Migrations.root_path, "db", "mappings_db_schema") UPLOADS_DB_SCHEMA_PATH = File.join(::Migrations.root_path, "db", "uploads_db_schema") def self.migrate(db_path, migrations_path:) diff --git a/migrations/lib/importer.rb b/migrations/lib/importer.rb new file mode 100644 index 00000000000..99db3193501 --- /dev/null +++ b/migrations/lib/importer.rb @@ -0,0 +1,13 @@ +# frozen_string_literal: true + +module Migrations + module Importer + def self.execute + config_path = File.join(::Migrations.root_path, "config", "importer.yml") + config = YAML.load_file(config_path, symbolize_names: true) + + executor = Executor.new(config) + executor.start + end + end +end diff --git a/migrations/lib/importer/.gitkeep b/migrations/lib/importer/.gitkeep deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/migrations/lib/importer/copy_step.rb b/migrations/lib/importer/copy_step.rb new file mode 100644 index 00000000000..8b7d50aad44 --- /dev/null +++ b/migrations/lib/importer/copy_step.rb @@ -0,0 +1,166 @@ +# frozen_string_literal: true + +module Migrations::Importer + class CopyStep < Step + MappingType = ::Migrations::Importer::MappingType + + NOW = "NOW()" + SYSTEM_USER_ID = Discourse::SYSTEM_USER_ID + + INSERT_MAPPED_IDS_SQL = <<~SQL + INSERT INTO mapped.ids (original_id, type, discourse_id) + VALUES (?, ?, ?) + SQL + + class << self + # stree-ignore + def table_name(value = (getter = true; nil)) + return @table_name if getter + @table_name = value + end + + # stree-ignore + def column_names(value = (getter = true; nil)) + return @column_names if getter + @column_names = value + end + + def timestamp_columns? + @timestamp_columns ||= + @column_names&.include?(:created_at) || @column_names&.include?(:updated_at) + end + + def store_mapped_ids(value) + @store_mapped_ids = value + end + + def store_mapped_ids? + !!@store_mapped_ids + end + + # stree-ignore + def total_rows_query(query = (getter = true; nil), *parameters) + return [@total_rows_query, @total_rows_query_parameters] if getter + + @total_rows_query = query + @total_rows_query_parameters = parameters + nil + end + + # stree-ignore + def rows_query(query = (getter = true; nil), *parameters) + return [@rows_query, @rows_query_parameters] if getter + + @rows_query = query + @rows_query_parameters = parameters + nil + end + end + + def initialize(intermediate_db, discourse_db, shared_data) + super + + @last_id = 0 + @mapping_type = nil + end + + def execute + super + with_progressbar(total_count) { copy_data } + nil + end + + private + + def copy_data + table_name = self.class.table_name || self.class.name&.demodulize&.underscore + column_names = self.class.column_names || @discourse_db.column_names(table_name) + skipped_rows = [] + + if self.class.store_mapped_ids? + @last_id = @discourse_db.last_id_of(table_name) + @mapping_type = find_mapping_type(table_name) + end + + @discourse_db.copy_data(table_name, column_names, fetch_rows(skipped_rows)) do |inserted_rows| + after_commit_of_inserted_rows(inserted_rows) + + if skipped_rows.any? + after_commit_of_skipped_rows(skipped_rows) + skipped_rows.clear + end + end + + @discourse_db.fix_last_id_of(table_name) if self.class.store_mapped_ids? + @intermediate_db.commit_transaction + end + + def fetch_rows(skipped_rows) + Enumerator.new do |enumerator| + query, parameters = self.class.rows_query + @intermediate_db.query(query, *parameters) do |row| + if (transformed_row = transform_row(row)) + enumerator << transformed_row + @stats.reset + else + skipped_rows << row + @stats.reset(skip_count: 1) + end + + update_progressbar + end + end + end + + def after_commit_of_inserted_rows(rows) + return unless self.class.store_mapped_ids? + + rows.each do |row| + @intermediate_db.insert(INSERT_MAPPED_IDS_SQL, [row[:original_id], @mapping_type, row[:id]]) + end + + nil + end + + def after_commit_of_skipped_rows(rows) + return unless self.class.store_mapped_ids? + + rows.each do |row| + if row[:id] && row[:original_id] + @intermediate_db.insert( + INSERT_MAPPED_IDS_SQL, + [row[:original_id], @mapping_type, row[:id]], + ) + end + end + + nil + end + + def transform_row(row) + row[:id] = (@last_id += 1) if self.class.store_mapped_ids? && row[:id].nil? + + if self.class.timestamp_columns? + row[:created_at] ||= NOW + row[:updated_at] = row[:created_at] + end + + row + end + + def find_mapping_type(table_name) + constant_name = table_name.to_s.upcase + + if MappingType.const_defined?(constant_name) + MappingType.const_get(constant_name) + else + raise "MappingType::#{constant_name} is not defined" + end + end + + def total_count + query, parameters = self.class.total_rows_query + @intermediate_db.count(query, *parameters) + end + end +end diff --git a/migrations/lib/importer/discourse_db.rb b/migrations/lib/importer/discourse_db.rb new file mode 100644 index 00000000000..8c921284f1a --- /dev/null +++ b/migrations/lib/importer/discourse_db.rb @@ -0,0 +1,115 @@ +# frozen_string_literal: true + +module Migrations::Importer + class DiscourseDB + COPY_BATCH_SIZE = 1_000 + + def initialize + @encoder = PG::TextEncoder::CopyRow.new + @connection = PG::Connection.new(database_configuration) + @connection.type_map_for_results = PG::BasicTypeMapForResults.new(@connection) + end + + def copy_data(table_name, column_names, rows) + quoted_column_name_list = column_names.map { |c| quote_identifier(c) }.join(",") + sql = "COPY #{table_name} (#{quoted_column_name_list}) FROM STDIN" + + rows.each_slice(COPY_BATCH_SIZE) do |sliced_rows| + # TODO Maybe add error handling and check if all rows fail to insert, or only + # some of them fail. Currently, if a single row fails to insert, then an exception + # will stop the whole import. Which seems fine because ideally the import script + # should ensure all data is valid. We might need to see how this works out in + # actual migrations... + @connection.transaction do + @connection.copy_data(sql, @encoder) do + sliced_rows.each do |row| + data = column_names.map { |c| row[c] } + @connection.put_copy_data(data) + end + end + + # give the caller a chance to do some work when a batch has been committed, + # for example, to store ID mappings + yield sliced_rows + end + end + + nil + end + + def last_id_of(table_name) + query = <<~SQL + SELECT COALESCE(MAX(id), 0) + FROM #{quote_identifier(table_name)} + WHERE id > 0 + SQL + + result = @connection.exec(query) + result.getvalue(0, 0) + end + + def fix_last_id_of(table_name) + table_name = quote_identifier(table_name) + query = <<~SQL + SELECT SETVAL(PG_GET_SERIAL_SEQUENCE('#{table_name}', 'id'), MAX(id)) + FROM #{table_name} + HAVING MAX(id) > 0 + SQL + + @connection.exec(query) + nil + end + + def column_names(table_name) + query = <<~SQL + SELECT column_name + FROM information_schema.columns + WHERE table_name = $1 + ORDER BY ordinal_position + SQL + + result = @connection.exec_params(query, [table_name]) + result.column_values(0).map(&:to_sym) + end + + def query_array(sql, *params) + @connection.send_query_params(sql, params) + @connection.set_single_row_mode + + Enumerator.new do |y| + while (result = @connection.get_result) + result.stream_each_row { |row| y.yield(row) } + result.clear + end + end + end + + def close + @connection.finish + end + + private + + def database_configuration + db_config = ActiveRecord::Base.connection_db_config.configuration_hash + + # credentials for PostgreSQL in CI environment + if Rails.env.test? + username = ENV["PGUSER"] + password = ENV["PGPASSWORD"] + end + + { + host: db_config[:host], + port: db_config[:port], + username: db_config[:username] || username, + password: db_config[:password] || password, + dbname: db_config[:database], + }.compact + end + + def quote_identifier(identifier) + PG::Connection.quote_ident(identifier.to_s) + end + end +end diff --git a/migrations/lib/importer/executor.rb b/migrations/lib/importer/executor.rb new file mode 100644 index 00000000000..1f1860d07a0 --- /dev/null +++ b/migrations/lib/importer/executor.rb @@ -0,0 +1,63 @@ +# frozen_string_literal: true + +module Migrations::Importer + class Executor + def initialize(config) + @intermediate_db = ::Migrations::Database.connect(config[:intermediate_db]) + @discourse_db = DiscourseDB.new + @shared_data = SharedData.new(@discourse_db) + + attach_mappings_db(config[:mappings_db]) + end + + def start + runtime = + ::Migrations::DateHelper.track_time do + execute_steps + ensure + cleanup + end + + puts I18n.t("importer.done", runtime: ::Migrations::DateHelper.human_readable_time(runtime)) + end + + private + + def attach_mappings_db(db_path) + # ::Migrations::Database.reset!(db_path) + ::Migrations::Database.migrate( + db_path, + migrations_path: ::Migrations::Database::MAPPINGS_DB_SCHEMA_PATH, + ) + @intermediate_db.execute("ATTACH DATABASE ? AS mapped", db_path) + end + + def step_classes + steps_module = ::Migrations::Importer::Steps + classes = + steps_module + .constants + .map { |c| steps_module.const_get(c) } + .select { |klass| klass.is_a?(Class) && klass < ::Migrations::Importer::Step } + TopologicalSorter.sort(classes) + end + + def execute_steps + max = step_classes.size + + step_classes + .each + .with_index(1) do |step_class, index| + puts "#{step_class.title} [#{index}/#{max}]" + step = step_class.new(@intermediate_db, @discourse_db, @shared_data) + step.execute + puts "" + end + end + + def cleanup + @intermediate_db.close + @discourse_db.close + end + end +end diff --git a/migrations/lib/importer/mapping_type.rb b/migrations/lib/importer/mapping_type.rb new file mode 100644 index 00000000000..5e3aef50615 --- /dev/null +++ b/migrations/lib/importer/mapping_type.rb @@ -0,0 +1,7 @@ +# frozen_string_literal: true + +module Migrations::Importer + module MappingType + USERS = 1 + end +end diff --git a/migrations/lib/importer/shared_data.rb b/migrations/lib/importer/shared_data.rb new file mode 100644 index 00000000000..763d553163c --- /dev/null +++ b/migrations/lib/importer/shared_data.rb @@ -0,0 +1,51 @@ +# frozen_string_literal: true + +module Migrations::Importer + class SharedData + def initialize(discourse_db) + @discourse_db = discourse_db + end + + def load_set(sql) + @discourse_db.query_array(sql).map(&:first).to_set + end + + def load_mapping(sql) + rows = @discourse_db.query_array(sql) + + if rows.first && rows.first.size > 2 + rows.to_h { |key, *values| [key, *values] } + else + rows.to_h + end + end + + def load(type) + case type + when :usernames + @existing_usernames_lower ||= load_set <<~SQL + SELECT username_lower + FROM users + SQL + when :group_names + @existing_group_names_lower ||= load_set <<~SQL + SELECT LOWER(name) + FROM groups + SQL + else + raise "Unknown type: #{type}" + end + end + + def unload_shared_data(type) + case type + when :usernames + @existing_usernames_lower = nil + when :group_names + @existing_group_names_lower = nil + else + raise "Unknown type: #{type}" + end + end + end +end diff --git a/migrations/lib/importer/step.rb b/migrations/lib/importer/step.rb new file mode 100644 index 00000000000..9ff887c291b --- /dev/null +++ b/migrations/lib/importer/step.rb @@ -0,0 +1,116 @@ +# frozen_string_literal: true + +module Migrations::Importer + class Step + class << self + # stree-ignore + def title(value = (getter = true; nil)) + if getter + return( + @title ||= + I18n.t( + "importer.default_step_title", + type: name&.demodulize&.underscore&.humanize(capitalize: false), + ) + ) + end + + @title = value + end + + def depends_on(*step_names) + steps_module = ::Migrations::Importer::Steps + classes = + step_names.map do |name| + name = name.to_s.camelize + klass = steps_module.const_get(name) if steps_module.const_defined?(name) + + unless klass.is_a?(Class) && klass < ::Migrations::Importer::Step + raise NameError, "Class #{class_name} not found" + end + + klass + end + + @dependencies ||= [] + @dependencies.concat(classes) + end + + def dependencies + @dependencies || [] + end + + def requires_mapping(name, sql) + @required_mappings ||= {} + @required_mappings[name] = sql + end + + def required_mappings + @required_mappings || {} + end + + def requires_set(name, sql) + @required_sets ||= {} + @required_sets[name] = sql + end + + def required_sets + @required_sets || {} + end + end + + def initialize(intermediate_db, discourse_db, shared_data) + @intermediate_db = intermediate_db + @discourse_db = discourse_db + @shared_data = shared_data + + @stats = StepStats.new(skip_count: 0, warning_count: 0, error_count: 0) + end + + def execute + load_required_data + end + + private + + def load_required_data + required_mappings = self.class.required_mappings + required_sets = self.class.required_sets + return if required_mappings.blank? && required_sets.blank? + + print " #{I18n.t("importer.loading_required_data")} " + + runtime = + ::Migrations::DateHelper.track_time do + required_mappings.each do |name, sql| + instance_variable_set("@#{name}", @shared_data.load_mapping(sql)) + end + + required_sets.each do |name, sql| + instance_variable_set("@#{name}", @shared_data.load_set(sql)) + end + end + + puts ::Migrations::DateHelper.human_readable_time(runtime) if runtime >= 1 + end + + def update_progressbar(increment_by: 1) + @progressbar.update( + increment_by:, + skip_count: @stats.skip_count, + warning_count: @stats.warning_count, + error_count: @stats.error_count, + ) + end + + def with_progressbar(max_progress) + ::Migrations::ExtendedProgressBar + .new(max_progress:) + .run do |progressbar| + @progressbar = progressbar + yield + @progressbar = nil + end + end + end +end diff --git a/migrations/lib/importer/step_stats.rb b/migrations/lib/importer/step_stats.rb new file mode 100644 index 00000000000..5b64060a3d5 --- /dev/null +++ b/migrations/lib/importer/step_stats.rb @@ -0,0 +1,12 @@ +# frozen_string_literal: true + +module Migrations::Importer + StepStats = + Struct.new(:skip_count, :warning_count, :error_count) do + def reset(skip_count: 0, warning_count: 0, error_count: 0) + self.skip_count = skip_count + self.warning_count = warning_count + self.error_count = error_count + end + end +end diff --git a/migrations/lib/importer/steps/.gitkeep b/migrations/lib/importer/steps/.gitkeep deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/migrations/lib/importer/steps/user_emails.rb b/migrations/lib/importer/steps/user_emails.rb new file mode 100644 index 00000000000..58f486b7992 --- /dev/null +++ b/migrations/lib/importer/steps/user_emails.rb @@ -0,0 +1,39 @@ +# frozen_string_literal: true + +module Migrations::Importer::Steps + class UserEmails < ::Migrations::Importer::CopyStep + depends_on :users + + requires_set :existing_user_ids, "SELECT DISTINCT user_id FROM user_emails" + + column_names %i[user_id email primary created_at updated_at] + + total_rows_query <<~SQL, MappingType::USERS + SELECT COUNT(*) + FROM users u + JOIN mapped.ids mu ON u.original_id = mu.original_id AND mu.type = ? + LEFT JOIN user_emails ue ON u.original_id = ue.user_id + SQL + + rows_query <<~SQL, MappingType::USERS + SELECT mu.discourse_id AS user_id, + ue.email, + COALESCE(ue."primary", TRUE) AS "primary", + COALESCE(ue.created_at, u.created_at) AS created_at + FROM users u + JOIN mapped.ids mu ON u.original_id = mu.original_id AND mu.type = ? + LEFT JOIN user_emails ue ON u.original_id = ue.user_id + ORDER BY ue.ROWID + SQL + + private + + def transform_row(row) + return nil if @existing_user_ids.include?(row[:user_id]) + + row[:email] ||= "#{SecureRandom.hex}@email.invalid" + + super + end + end +end diff --git a/migrations/lib/importer/steps/users.rb b/migrations/lib/importer/steps/users.rb index 61170f859e9..c6d713adf63 100644 --- a/migrations/lib/importer/steps/users.rb +++ b/migrations/lib/importer/steps/users.rb @@ -60,9 +60,10 @@ module Migrations::Importer::Steps JSON_GROUP_ARRAY(LOWER(ue.email)) AS emails FROM users u LEFT JOIN user_emails ue ON u.original_id = ue.user_id + LEFT JOIN mapped.ids amu ON u.approved_by_id IS NOT NULL AND u.approved_by_id = amu.original_id AND amu.type = ?1 LEFT JOIN user_suspensions us ON u.original_id = us.user_id AND us.suspended_at < DATETIME() AND (us.suspended_till IS NULL OR us.suspended_till > DATETIME()) - LEFT JOIN mapped.ids mu ON u.original_id = mu.original_id AND mu.type = ? + LEFT JOIN mapped.ids mu ON u.original_id = mu.original_id AND mu.type = ?1 WHERE mu.original_id IS NULL GROUP BY u.original_id ORDER BY u.ROWID diff --git a/migrations/lib/importer/topological_sorter.rb b/migrations/lib/importer/topological_sorter.rb new file mode 100644 index 00000000000..7d1bab0d79a --- /dev/null +++ b/migrations/lib/importer/topological_sorter.rb @@ -0,0 +1,50 @@ +# frozen_string_literal: true + +module Migrations::Importer + class TopologicalSorter + def self.sort(classes) + new(classes).sort + end + + def initialize(classes) + @classes = classes + @dependency_graph = build_dependency_graph + end + + def sort + in_degree = Hash.new(0) + @dependency_graph.each_value { |edges| edges.each { |edge| in_degree[edge] += 1 } } + + queue = @classes.reject { |cls| in_degree[cls] > 0 } + result = [] + + while queue.any? + node = queue.shift + result << node + + @dependency_graph[node].each do |child| + in_degree[child] -= 1 + queue << child if in_degree[child] == 0 + end + end + + raise "Circular dependency detected" if result.size < @classes.size + + result + end + + private + + def build_dependency_graph + graph = Hash.new { |hash, key| hash[key] = [] } + @classes + .sort_by(&:to_s) + .each do |klass| + dependencies = klass.dependencies || [] + dependencies.each { |dependency| graph[dependency] << klass } + graph[klass] ||= [] + end + graph + end + end +end diff --git a/migrations/lib/importer/unique_name_finder.rb b/migrations/lib/importer/unique_name_finder.rb new file mode 100644 index 00000000000..aecf68480af --- /dev/null +++ b/migrations/lib/importer/unique_name_finder.rb @@ -0,0 +1,84 @@ +# frozen_string_literal: true + +module Migrations::Importer + class UniqueNameFinder + MAX_USERNAME_LENGTH = 60 + + def initialize(shared_data) + @used_usernames_lower = shared_data.load(:usernames) + @used_group_names_lower = shared_data.load(:group_names) + @last_suffixes = {} + + @fallback_username = + UserNameSuggester.sanitize_username(I18n.t("fallback_username")).presence || + UserNameSuggester::LAST_RESORT_USERNAME + @fallback_group_name = "group" + end + + def find_available_username(username, allow_reserved_username: false) + username, username_lower = + find_available_name( + username, + fallback_name: @fallback_username, + max_name_length: MAX_USERNAME_LENGTH, + allow_reserved_username:, + ) + + @used_usernames_lower.add(username_lower) + username + end + + def find_available_group_name(group_name) + group_name, group_name_lower = + find_available_name(group_name, fallback_name: @fallback_group_name) + + @used_group_names_lower.add(group_name_lower) + group_name + end + + private + + def name_available?(name, allow_reserved_username: false) + name_lower = name.downcase + + return false if @used_usernames_lower.include?(name_lower) + return false if @used_group_names_lower.include?(name_lower) + return false if !allow_reserved_username && User.reserved_username?(name_lower) + true + end + + def find_available_name( + name, + fallback_name:, + max_name_length: nil, + allow_reserved_username: false + ) + name = name.unicode_normalize + name = UserNameSuggester.sanitize_username(name) + name = fallback_name.dup if name.blank? + name = UserNameSuggester.truncate(name, max_name_length) if max_name_length + + if !name_available?(name, allow_reserved_username:) + # if the name ends with a number, then use an underscore before appending the suffix + suffix_separator = name.match?(/\d$/) ? "_" : "" + suffix = next_suffix(name).to_s + + # TODO This needs better logic, because it's possible that the max username length is exceeded + name = +"#{name}#{suffix_separator}#{suffix}" + name.next! until name_available?(name, allow_reserved_username:) + end + + [name, name.downcase] + end + + def next_suffix(name) + name_lower = name.downcase + @last_suffixes.fetch(name_lower, 0) + 1 + end + + def store_last_suffix(name) + name_lower = name.downcase + @last_suffixes[$1] = $2.to_i if name_lower =~ /^(.+?)(\d+)$/ + end + end +end