FEATURE: Import script for AnswerBase

Improves the generic database used by some import scripts:
* Adds additional columns for users
* Adds support for attachments
* Allows setting the data type for keys (numeric or string) to ensure correct sorting
2025-09-06 10:50:21 +08:00 · 2019-02-28 21:59:36 +01:00 · 2019-02-28 21:59:36 +01:00 · c36c9c2ee5
commit c36c9c2ee5
parent ff26b4ed9b
5 changed files with 471 additions and 40 deletions
--- a/script/import_scripts/answerbase.rb
+++ b/script/import_scripts/answerbase.rb
@ -0,0 +1,341 @@
+require 'csv'
+require 'reverse_markdown'
+require_relative 'base'
+require_relative 'base/generic_database'
+
+# Call it like this:
+#   RAILS_ENV=production bundle exec ruby script/import_scripts/answerbase.rb DIRNAME
+class ImportScripts::Answerbase < ImportScripts::Base
+  OLD_DOMAIN = "http://answerbase.example.com" # without trailing slash
+  NEW_DOMAIN = "https://discourse.example.com"
+  AVATAR_DIRECTORY = "User Images"
+  ANSWER_ATTACHMENT_DIRECTORY = "Answer Attachments"
+  ANSWER_IMAGE_DIRECTORY = "Answer Images"
+  QUESTION_ATTACHMENT_DIRECTORY = "Question Attachments"
+  QUESTION_IMAGE_DIRECTORY = "Question Images"
+  EMBEDDED_IMAGE_REGEX = /<a[^>]*href="[^"]*relativeUrl=(?<path>[^"\&]*)[^"]*"[^>]*>\s*<img[^>]*>\s*<\/a>/i
+  QUESTION_LINK_REGEX = /<a[^>]*?href="#{Regexp.escape(OLD_DOMAIN)}\/[^"]*?(?:q|questionid=)(?<id>\d+)[^"]*?"[^>]*>(?<text>.*?)<\/a>/i
+  TOPIC_LINK_NORMALIZATION = '/.*?-(q\d+).*/\1'
+  BATCH_SIZE = 1000
+
+  def initialize(path)
+    super()
+
+    @path = path
+    @db = ImportScripts::GenericDatabase.new(
+      @path,
+      batch_size: BATCH_SIZE,
+      recreate: true,
+      numeric_keys: true
+    )
+  end
+
+  def execute
+    read_csv_files
+
+    add_permalink_normalizations
+    import_categories
+    import_users
+    import_topics
+    import_posts
+  end
+
+  def read_csv_files
+    puts "", "reading CSV files..."
+
+    category_position = 0
+    csv_parse("categories") do |row|
+      @db.insert_category(
+        id: row[:id],
+        name: row[:name],
+        position: category_position += 1
+      )
+    end
+
+    csv_parse("users") do |row|
+      @db.insert_user(
+        id: row[:id],
+        email: row[:email],
+        username: row[:username],
+        bio: row[:description],
+        avatar_path: row[:profile_image],
+        created_at: parse_date(row[:createtime]),
+        active: true
+      )
+    end
+
+    last_topic_id = nil
+    csv_parse("questions-answers-comments") do |row|
+      next if row[:published] == "No"
+      user_id = @db.get_user_id(row[:username])
+      created_at = parse_datetime(row[:createtime])
+
+      begin
+        if row[:type] == "Question"
+          attachments = parse_filenames(row[:attachments], QUESTION_ATTACHMENT_DIRECTORY) +
+            parse_filenames(row[:images], QUESTION_IMAGE_DIRECTORY)
+
+          @db.insert_topic(
+            id: row[:id],
+            title: row[:title],
+            raw: row[:text],
+            category_id: row[:categorylist],
+            user_id: user_id,
+            created_at: created_at,
+            attachments: attachments
+          )
+          last_topic_id = row[:id]
+        else
+          attachments = parse_filenames(row[:attachments], ANSWER_ATTACHMENT_DIRECTORY) +
+            parse_filenames(row[:images], ANSWER_IMAGE_DIRECTORY)
+
+          @db.insert_post(
+            id: row[:id],
+            raw: row[:text],
+            topic_id: last_topic_id,
+            user_id: user_id,
+            created_at: created_at,
+            attachments: attachments
+          )
+        end
+      rescue
+        p row
+        raise
+      end
+    end
+  end
+
+  def parse_filenames(text, directory)
+    return [] if text.blank?
+
+    text
+      .split(';')
+      .map { |filename| File.join(@path, directory, filename.strip) }
+  end
+
+  def parse_date(text)
+    return nil if text.blank?
+    DateTime.strptime(text, "%m/%d/%y")
+  end
+
+  def parse_datetime(text)
+    return nil if text.blank?
+    # DateTime.strptime(text, "%m/%d/%Y %H:%M")
+    DateTime.parse(text).utc.to_datetime
+  end
+
+  def import_categories
+    puts "", "creating categories"
+    rows = @db.fetch_categories
+
+    create_categories(rows) do |row|
+      {
+        id: row['id'],
+        name: row['name'],
+        description: row['description'],
+        position: row['position']
+      }
+    end
+  end
+
+  def batches
+    super(BATCH_SIZE)
+  end
+
+  def import_users
+    puts "", "creating users"
+    total_count = @db.count_users
+    last_id = 0
+
+    batches do |offset|
+      rows, last_id = @db.fetch_users(last_id)
+      break if rows.empty?
+
+      next if all_records_exist?(:users, rows.map { |row| row['id'] })
+
+      create_users(rows, total: total_count, offset: offset) do |row|
+        {
+          id: row['id'],
+          email: row['email'],
+          username: row['username'],
+          bio_raw: row['bio'],
+          created_at: row['created_at'],
+          active: row['active'] == 1,
+          post_create_action: proc do |user|
+            create_avatar(user, row['avatar_path'])
+          end
+        }
+      end
+    end
+  end
+
+  def create_avatar(user, avatar_path)
+    return if avatar_path.blank?
+    avatar_path = File.join(@path, AVATAR_DIRECTORY, avatar_path)
+
+    if File.exist?(avatar_path)
+      @uploader.create_avatar(user, avatar_path)
+    else
+      STDERR.puts "Could not find avatar: #{avatar_path}"
+    end
+  end
+
+  def import_topics
+    puts "", "creating topics"
+    total_count = @db.count_topics
+    last_id = 0
+
+    batches do |offset|
+      rows, last_id = @db.fetch_topics(last_id)
+      break if rows.empty?
+
+      next if all_records_exist?(:posts, rows.map { |row| row['id'] })
+
+      create_posts(rows, total: total_count, offset: offset) do |row|
+        attachments = @db.fetch_topic_attachments(row['id']) if row['upload_count'] > 0
+        user_id = user_id_from_imported_user_id(row['user_id']) || Discourse.system_user.id
+
+        {
+          id: row['id'],
+          title: row['title'],
+          raw: raw_with_attachments(row['raw'].presence || row['title'], attachments, user_id),
+          category: category_id_from_imported_category_id(row['category_id']),
+          user_id: user_id,
+          created_at: row['created_at'],
+          closed: row['closed'] == 1,
+          post_create_action: proc do |post|
+            url = "q#{row['id']}"
+            Permalink.create(url: url, topic_id: post.topic.id) unless permalink_exists?(url)
+          end
+        }
+      end
+    end
+  end
+
+  def import_posts
+    puts "", "creating posts"
+    total_count = @db.count_posts
+    last_row_id = 0
+
+    batches do |offset|
+      rows, last_row_id = @db.fetch_posts(last_row_id)
+      break if rows.empty?
+
+      next if all_records_exist?(:posts, rows.map { |row| row['id'] })
+
+      create_posts(rows, total: total_count, offset: offset) do |row|
+        topic = topic_lookup_from_imported_post_id(row['topic_id'])
+        attachments = @db.fetch_post_attachments(row['id']) if row['upload_count'] > 0
+        user_id = user_id_from_imported_user_id(row['user_id']) || Discourse.system_user.id
+
+        {
+          id: row['id'],
+          raw: raw_with_attachments(row['raw'], attachments, user_id),
+          user_id: user_id,
+          topic_id: topic[:topic_id],
+          created_at: row['created_at']
+        }
+      end
+    end
+  end
+
+  def raw_with_attachments(raw, attachments, user_id)
+    raw, embedded_paths, upload_ids = replace_embedded_attachments(raw, user_id)
+    raw = replace_question_links(raw)
+    raw = ReverseMarkdown.convert(raw) || ""
+
+    attachments&.each do |attachment|
+      path = attachment['path']
+      next if embedded_paths.include?(path)
+
+      if File.exist?(path)
+        filename = File.basename(path)
+        upload = @uploader.create_upload(user_id, path, filename)
+
+        if upload.present? && upload.persisted? && !upload_ids.include?(upload.id)
+          raw << "\n" << @uploader.html_for_upload(upload, filename)
+        end
+      else
+        STDERR.puts "Could not find file: #{path}"
+      end
+    end
+
+    raw
+  end
+
+  def replace_embedded_attachments(raw, user_id)
+    paths = []
+    upload_ids = []
+
+    raw = raw.gsub(EMBEDDED_IMAGE_REGEX) do
+      path = File.join(@path, Regexp.last_match['path'])
+      filename = File.basename(path)
+      path = find_image_path(filename)
+
+      if path
+        upload = @uploader.create_upload(user_id, path, filename)
+
+        if upload.present? && upload.persisted?
+          paths << path
+          upload_ids << upload.id
+          @uploader.html_for_upload(upload, filename)
+        end
+      else
+        STDERR.puts "Could not find file: #{path}"
+      end
+    end
+
+    [raw, paths, upload_ids]
+  end
+
+  def find_image_path(filename)
+    [QUESTION_IMAGE_DIRECTORY, ANSWER_IMAGE_DIRECTORY].each do |directory|
+      path = File.join(@path, directory, filename)
+      return path if File.exist?(path)
+    end
+  end
+
+  def replace_question_links(raw)
+    raw.gsub(QUESTION_LINK_REGEX) do
+      topic_id = Regexp.last_match("id")
+      topic = topic_lookup_from_imported_post_id(topic_id)
+      return Regexp.last_match.to_s unless topic
+
+      url = File.join(NEW_DOMAIN, topic[:url])
+      text = Regexp.last_match("text")
+      text.include?(OLD_DOMAIN) ? url : "<a href='#{url}'>#{text}</a>"
+    end
+  end
+
+  def add_permalink_normalizations
+    normalizations = SiteSetting.permalink_normalizations
+    normalizations = normalizations.blank? ? [] : normalizations.split('|')
+
+    add_normalization(normalizations, TOPIC_LINK_NORMALIZATION)
+
+    SiteSetting.permalink_normalizations = normalizations.join('|')
+  end
+
+  def add_normalization(normalizations, normalization)
+    normalizations << normalization unless normalizations.include?(normalization)
+  end
+
+  def permalink_exists?(url)
+    Permalink.find_by(url: url)
+  end
+
+  def csv_parse(table_name)
+    CSV.foreach(File.join(@path, "#{table_name}.csv"),
+                headers: true,
+                header_converters: :symbol,
+                skip_blanks: true,
+                encoding: 'bom|utf-8') { |row| yield row }
+  end
+end
+
+unless ARGV[0] && Dir.exist?(ARGV[0])
+  puts "", "Usage:", "", "bundle exec ruby script/import_scripts/answerbase.rb DIRNAME", ""
+  exit 1
+end
+
+ImportScripts::Answerbase.new(ARGV[0]).perform