Skip to content

Commit

Permalink
Merge pull request #13 from notch8/csv_importer
Browse files Browse the repository at this point in the history
CSV Importer
  • Loading branch information
Julie Allinson authored Jul 11, 2019
2 parents c4a84d6 + 1d4f355 commit 3aad371
Show file tree
Hide file tree
Showing 25 changed files with 359 additions and 48 deletions.
1 change: 1 addition & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ gem "blacklight", "~> 6.20.0"

group :development, :test do
gem 'solr_wrapper', '>= 0.3'
gem 'database_cleaner'
end

gem 'rsolr', '>= 1.0'
Expand Down
4 changes: 3 additions & 1 deletion Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ GEM
concurrent-ruby (1.1.5)
connection_pool (2.2.2)
crass (1.0.4)
database_cleaner (1.7.0)
declarative (0.0.10)
declarative-option (0.1.0)
deprecation (1.0.0)
Expand Down Expand Up @@ -710,6 +711,7 @@ DEPENDENCIES
bootstrap-sass (~> 3.0)
bulkrax!
byebug
database_cleaner
factory_bot_rails
hyrax (~> 2.3)
oai
Expand All @@ -721,4 +723,4 @@ DEPENDENCIES
twitter-typeahead-rails (= 0.11.1.pre.corejavascript)

BUNDLED WITH
1.17.2
1.17.3
2 changes: 1 addition & 1 deletion app/assets/javascripts/bulkrax/importers.js.erb
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ function prepBulkrax(event) {
var external_set_select = $("#importer_parser_fields_set")
var initial_base_url = base_url.val()

// handle refreshing/loading of external setes via button click
// handle refreshing/loading of external sets via button click
$('body').on('click', '.refresh-set-source', function(e) {
e.preventDefault()

Expand Down
32 changes: 20 additions & 12 deletions app/factories/bulkrax/object_factory.rb
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# TODO require 'importer/log_subscriber'
# TODO: require 'importer/log_subscriber'
module Bulkrax
class ObjectFactory
extend ActiveModel::Callbacks
Expand All @@ -16,9 +16,9 @@ def initialize(attributes, files_dir = nil, files = [], user = nil)
def run
arg_hash = { id: attributes[:id], name: 'UPDATE', klass: klass }
@object = find
if @object
@object.reindex_extent = Hyrax::Adapters::NestingIndexAdapter::LIMITED_REINDEX
ActiveSupport::Notifications.instrument('import.importer', arg_hash) { update }
if @object
@object.reindex_extent = Hyrax::Adapters::NestingIndexAdapter::LIMITED_REINDEX
ActiveSupport::Notifications.instrument('import.importer', arg_hash) { update }
else
ActiveSupport::Notifications.instrument('import.importer', arg_hash.merge(name: 'CREATE')) { create }
end
Expand All @@ -28,6 +28,7 @@ def run

def update
raise "Object doesn't exist" unless object

run_callbacks(:save) do
work_actor.update(environment(update_attributes))
end
Expand Down Expand Up @@ -106,16 +107,19 @@ def create_collection(attrs)
# a way that is compatible with how the factory needs them.
def transform_attributes
attributes.slice(*permitted_attributes)
.merge(file_attributes)
.merge(file_attributes)
end

# Find existing file or upload new file. This assumes a Work will have unique file titles;
# Find existing files or upload new files. This assumes a Work will have unique file titles;
# and that those file titles will not have changed
# could filter by URIs instead (slower).
# When an uploaded_file already exists we do not want to pass its id in `file_attributes`
# otherwise it gets reuploaded by `work_actor`.
# support multiple files; ensure attributes[:file] is an Array
def upload_ids
work_files_titles = object.file_sets.map(&:title) if object.present? && object.file_sets.present?
work_files_titles && work_files_titles.include?(attributes[:file]) ? [] : [import_file(file_paths.first)]
attributes[:file] = Array.wrap(attributes[:file])
work_files_titles = object.file_sets.map { |t| t.title.to_a }.flatten if object.present? && object.file_sets.present?
work_files_titles && (work_files_titles & attributes[:file]).present? ? [] : import_files
end

def file_attributes
Expand All @@ -127,17 +131,21 @@ def file_attributes

def new_remote_files
@new_remote_files ||= if attributes[:remote_files].present? && object.present? && object.file_sets.present?
attributes[:remote_files].select do |file|
existing = object.file_sets.detect {|f| f.import_url && f.import_url == file[:url]}
!existing
attributes[:remote_files].reject do |file|
existing = object.file_sets.detect { |f| f.import_url && f.import_url == file[:url] }
existing
end
elsif attributes[:remote_files].present?
attributes[:remote_files]
end
end

def file_paths
attributes[:file].map { |file_name| File.join(files_directory, file_name) } if attributes[:file]
attributes[:file]&.map { |file_name| File.join(files_directory, file_name) }
end

# Imports every path returned by file_paths through import_file.
#
# file_paths uses safe navigation and therefore returns nil when
# attributes[:file] is nil; Array() turns that into an empty list so this
# method returns [] instead of raising NoMethodError.
#
# @return [Array] one import_file result per path, in order
def import_files
  Array(file_paths).map { |path| import_file(path) }
end

def import_file(path)
Expand Down
4 changes: 4 additions & 0 deletions app/matchers/bulkrax/csv_matcher.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
module Bulkrax
# Matcher class for CSV imports; Bulkrax::CsvEntry#matcher_class returns it.
# It defines nothing of its own: all behavior is inherited from ApplicationMatcher.
class CsvMatcher < ApplicationMatcher
end
end
2 changes: 2 additions & 0 deletions app/models/bulkrax/concerns/has_matchers.rb
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ def matcher(name, args={})
def add_metadata(node_name, node_content)
matcher = self.class.matchers[node_name]

return unless factory_class.method_defined?(node_name.to_sym) || node_name == 'file'

if matcher
result = matcher.result(self, node_content)
if result
Expand Down
60 changes: 60 additions & 0 deletions app/models/bulkrax/csv_entry.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
require 'csv'

module Bulkrax
class CsvEntry < Entry
include Bulkrax::Concerns::HasMatchers

serialize :raw_metadata, JSON

matcher 'contributor', split: true
matcher 'creator', split: true
matcher 'date', split: true
matcher 'description'
matcher 'format_digital', parsed: true
matcher 'format_original', parsed: true
matcher 'identifier'
matcher 'language', parsed: true, split: true
matcher 'place'
matcher 'publisher', split: true
matcher 'rights_statement'
matcher 'subject', split: true
matcher 'title'
matcher 'alternative_title'
matcher 'types', from: %w[types type], split: true, parsed: true
matcher 'file', split: true

# Builds parsed_metadata from the stored CSV row.
#
# Resets parsed_metadata to an empty hash, validates the record, hands every
# key/value pair of the record to add_metadata, and finally applies the
# visibility and rights statement defaults.
#
# @return [Hash] the populated parsed_metadata
# @raise [StandardError] when there is no record, or when the record lacks
#   one of the required elements
def build_metadata
  self.parsed_metadata = {}

  raise StandardError, 'Record not found' if record.nil?

  # Guard on truthiness rather than comparing against the literal `false`.
  unless required_elements?(record.keys)
    raise StandardError, "Missing required elements, required elements are: #{required_elements.join(', ')}"
  end

  record.each { |field, content| add_metadata(field, content) }
  add_visibility
  add_rights_statement

  parsed_metadata
end

# Memoized accessor for the data backing this entry.
#
# @return [Object] raw_metadata, cached in @record once it is truthy
def record
  return @record if @record

  @record = raw_metadata
end

# Matcher class associated with CSV entries.
# @return [Class] Bulkrax::CsvMatcher
def matcher_class
Bulkrax::CsvMatcher
end

# Tells whether every required element appears among the given keys.
#
# @param keys [#include?] the keys to check (build_metadata passes record.keys)
# @return [Boolean] true when no required element is missing
def required_elements?(keys)
  required_elements.all? { |element| keys.include?(element) }
end

# Names of the elements a CSV record must contain.
#
# @return [Array<String>]
def required_elements
  %w[title identifier]
end
end
end
42 changes: 34 additions & 8 deletions app/models/bulkrax/entry.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,11 @@ class Entry < ApplicationRecord
def build
# attributes, files_dir = nil, files = [], user = nil
begin
@item = Bulkrax::ApplicationFactory.for(entry_class.to_s).new(build_metadata, nil, [], user).run
@item = Bulkrax::ApplicationFactory.for(factory_class.to_s).new(build_metadata, parser.files_path, [], user).run
rescue => e
self.last_error = "#{e.message}\n\n#{e.backtrace}"
self.last_error_at = Time.now
self.last_exception = e
status_info(e)
else
self.last_error = nil
self.last_error_at = nil
self.last_exception = nil
self.last_succeeded_at = Time.now
status_info
self.collection_id = @item.id if @item.is_a?(Collection)
end
return @item
Expand All @@ -39,6 +34,24 @@ def build_metadata
raise 'Not Implemented'
end

# Defaults parsed_metadata['visibility'] to 'open' when it is blank;
# an existing non-blank value is left untouched.
#
# @return [String, nil] 'open' when the default was applied, otherwise nil
def add_visibility
  return unless parsed_metadata['visibility'].blank?

  parsed_metadata['visibility'] = 'open'
end

# Sets parsed_metadata['rights_statement'] to the parser-level
# 'rights_statement' field (wrapped in an array) when the override flag is on
# or when the parsed value is blank.
#
# @return [Array, nil] the assigned array, or nil when nothing was changed
def add_rights_statement
  use_parser_value = override_rights_statement || parsed_metadata['rights_statement'].blank?
  return unless use_parser_value

  parsed_metadata['rights_statement'] = [parser.parser_fields['rights_statement']]
end

# Whether the parser's 'override_rights_statement' field is switched on.
# The field is compared as a string, so 'true', '1', true and 1 all count.
#
# @return [Boolean]
def override_rights_statement
  flag = parser.parser_fields['override_rights_statement'].to_s
  %w[true 1].include?(flag)
end

# Class whose name #build passes to Bulkrax::ApplicationFactory.for.
# @return [Class] Work
def factory_class
Work
end

def status
if self.last_error_at.present?
'failed'
Expand All @@ -57,5 +70,18 @@ def status_at
self.last_error_at
end
end

# Records the outcome of a build on this entry.
#
# Called without an argument (or with nil) it clears the error fields and
# stamps last_succeeded_at; called with an exception it stores the message
# and backtrace, the time of the failure and the exception itself.
#
# @param e [Exception, nil] the error raised during the build, if any
# @return [Object] the value of the last assignment performed
def status_info(e = nil)
  case e
  when nil
    self.last_error = nil
    self.last_error_at = nil
    self.last_exception = nil
    self.last_succeeded_at = Time.now
  else
    self.last_error = "#{e.message}\n\n#{e.backtrace}"
    self.last_error_at = Time.now
    self.last_exception = e
  end
end
end
end
11 changes: 3 additions & 8 deletions app/models/bulkrax/oai_entry.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,6 @@

module Bulkrax
class OaiEntry < Entry
def entry_class
Work
end

def raw_record
@raw_record ||= client.get_record({identifier: identifier, metadata_prefix: parser.parser_fields['metadata_prefix'] })
Expand Down Expand Up @@ -47,12 +44,10 @@ def build_metadata
end
add_metadata('thumbnail_url', thumbnail_url)

self.parsed_metadata['contributing_institution'] = [contributing_institution]
if override_rights_statement || self.parsed_metadata['rights_statement'].blank?
self.parsed_metadata['rights_statement'] = [rights_statement]
end
self.parsed_metadata['visibility'] = 'open'
add_visibility
add_rights_statement
self.parsed_metadata['source'] ||= [record.header.identifier]
self.parsed_metadata['contributing_institution'] = [contributing_institution]

if collection.present?
self.parsed_metadata['collections'] ||= []
Expand Down
2 changes: 2 additions & 0 deletions app/parsers/bulkrax/application_parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ def records(opts = {})
raise 'must be defined'
end

# Path handed to the object factory by Entry#build (parser.files_path).
# This base implementation has an empty body and therefore returns nil.
def files_path; end

def record(identifier, opts = {})
return @record if @record

Expand Down
58 changes: 58 additions & 0 deletions app/parsers/bulkrax/csv_parser.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
module Bulkrax
class CsvParser < ApplicationParser
delegate :errors, :increment_counters, :parser_fields, to: :importer

# Declares the parser-specific fields of the CSV importer and their types.
#
# The importer form and Entry#add_rights_statement both use the singular
# 'rights_statement' key, so it is declared here. The plural
# :rights_statements entry is retained only so existing readers of this hash
# keep working.
# NOTE(review): confirm nothing depends on :rights_statements, then drop it.
#
# @return [Hash{Symbol => Symbol}] field name => field type
def self.parser_fields
  {
    csv_path: :string,
    rights_statement: :string,
    rights_statements: :string,
    override_rights_statement: :boolean
  }
end

# Runs the parser; this simply delegates to create_works.
def run
create_works
end

# Enumerates the rows of the CSV file named by parser_fields['csv_path'].
#
# The first line is treated as the header and header names are converted to
# symbols. No block is given to CSV.foreach, so an enumerator is returned and
# nothing is read until it is iterated.
#
# @param _opts [Hash] accepted for interface compatibility; unused
# @return [Enumerator] yields one CSV::Row per data line
def records(_opts = {})
  csv_options = { headers: true, header_converters: :symbol, encoding: 'utf-8' }
  CSV.foreach(parser_fields['csv_path'], **csv_options)
end

# Walks the CSV rows, creating (or finding) one entry per row and queueing an
# ImportWorkJob for each.
#
# Any StandardError raised along the way is not propagated; it is recorded on
# errors under :base instead (see the rescue clause).
def create_works
# index is the zero-based position of the row in the file, including rows
# that are skipped below.
records.with_index(0) do |record, index|
# Rows without an identifier are ignored.
next if record[:identifier].blank?
# Stop once the row position reaches the limit, when a limit is set.
break if !limit.nil? && index >= limit

seen[record[:identifier]] = true
# Look the entry up by importer, identifier and the row's data (nil values
# removed); create it when no match exists.
new_entry = entry_class.where(importer: importer, identifier: record[:identifier], raw_metadata: record.to_h.compact).first_or_create!
ImportWorkJob.perform_later(new_entry.id, importer.current_importer_run.id)
increment_counters(index)
end
rescue StandardError => e
# The error type is the exception class name; the message is the exception's own.
errors.add(:base, e.class.to_s.to_sym, message: e.message)
end

# Path of the 'files' directory that sits beside the CSV file: the last
# '/'-separated segment of parser_fields['csv_path'] is replaced by 'files'.
#
# @return [String]
def files_path
  *parent_segments, _csv_name = parser_fields['csv_path'].split('/')
  [*parent_segments, 'files'].join('/')
end

# Entry class that create_works uses to find or create a row's entry.
# @return [Class] CsvEntry
def entry_class
CsvEntry
end

# Number of data rows in the CSV: the line count minus the header line.
#
# Lines are counted in Ruby, streaming the file one line at a time, rather
# than through a shell command. This keeps the user-supplied path out of any
# shell, and avoids the `.to_i -1` form, which Ruby parses as `to_i(-1)`
# (an invalid radix) instead of a subtraction.
#
# @return [Integer] never negative; 0 when the file is empty or unreadable
def total
  @total ||= [File.foreach(parser_fields['csv_path']).count - 1, 0].max
rescue StandardError
  @total = 0
end
end
end
12 changes: 12 additions & 0 deletions app/views/bulkrax/importers/_csv_fields.html.erb
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
<%# Importer form fields for the CSV parser: CSV path, rights statement and the override flag. %>
<div class='csv_fields'>
  <%= fi.input :csv_path, as: :string, input_html: { value: importer.parser_fields['csv_path'] } %>
  <% rights_statements = Hyrax.config.rights_statement_service_class.new %>
  <%= fi.input :rights_statement,
        collection: rights_statements.select_active_options,
        selected: importer.parser_fields['rights_statement'],
        include_blank: true,
        item_helper: rights_statements.method(:include_current_value),
        input_html: { class: 'form-control' } %>
  <%= fi.input :override_rights_statement, as: :boolean, hint: 'If checked, always use the selected rights statement. If unchecked, use dc:rights from the record and only use the provided value if dc:rights is blank.', input_html: { checked: (importer.parser_fields['override_rights_statement'] == "1") } %>

</div>
1 change: 1 addition & 0 deletions app/views/bulkrax/importers/_form.html.erb
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
<%= form.input :name %>
<%= form.input :admin_set_id, collection: available_admin_sets %>
<%= form.hidden_field :user_id, value: current_user.id %>
<%= form.input :frequency, collection: form.object.frequency_enums %>
Expand Down
3 changes: 2 additions & 1 deletion lib/bulkrax.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ class << self
mattr_accessor :parsers
self.parsers = [
{ name: "OAI - Dublin Core", class_name: "Bulkrax::OaiDcParser", partial: 'oai_fields' },
{ name: "OAI - Qualified Dublin Core", class_name: "Bulkrax::OaiQualifiedDcParser", partial: 'oai_fields' }
{ name: "OAI - Qualified Dublin Core", class_name: "Bulkrax::OaiQualifiedDcParser", partial: 'oai_fields' },
{ name: "CSV - Comma Separated Values", class_name: "Bulkrax::CsvParser", partial: 'csv_fields' }
]
end

Expand Down
2 changes: 1 addition & 1 deletion spec/factories/bulkrax/entries.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
FactoryBot.define do
factory :bulkrax_entry, class: 'Entry' do
factory :bulkrax_entry, class: 'Bulkrax::Entry' do
identifier { "MyString" }
type { "" }
importer { nil }
Expand Down
Loading

0 comments on commit 3aad371

Please sign in to comment.