diff --git a/lib/docsplit/command_line.rb b/lib/docsplit/command_line.rb index 60ee7ef..4900042 100755 --- a/lib/docsplit/command_line.rb +++ b/lib/docsplit/command_line.rb @@ -101,6 +101,10 @@ def parse_options opts.on('-r', '--rolling', 'generate images from each previous image') do |r| @options[:rolling] = true end + opts.on('--delimiter [DELIMITER]', 'set page number delimiter (eg: _, -, -Page-...)') do |d| + # DELIMITER is optional, so d is nil for a bare --delimiter; leave the option unset then so the default delimiter applies. + @options[:delimiter] = d.tr('^','') if d + end opts.on_tail('-v', '--version', 'display docsplit version') do puts "Docsplit version #{Docsplit::VERSION}" exit diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb index 8c29bbc..0138034 100755 --- a/lib/docsplit/image_extractor.rb +++ b/lib/docsplit/image_extractor.rb @@ -7,6 +7,7 @@ class ImageExtractor MEMORY_ARGS = "-limit memory 256MiB -limit map 512MiB" DEFAULT_FORMAT = :png DEFAULT_DENSITY = '150' + DEFAULT_PAGE_DELIMITER = "_" # Extract a list of PDFs as rasterized page images, according to the # configuration in options. @@ -41,7 +42,7 @@ def convert(pdf, size, format, previous=nil) raise ExtractionFailed, result if $? != 0 else page_list(pages).each do |page| - out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")] + out_file = ESCAPE[File.join(directory, "#{basename}#{@delimiter}#{page}.#{format}")] cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp result = `#{cmd}`.chomp raise ExtractionFailed, result if $? != 0 @@ -63,6 +64,7 @@ def extract_options(options) @sizes = [options[:size]].flatten.compact @sizes = [nil] if @sizes.empty? 
@rolling = !!options[:rolling] + @delimiter = options[:delimiter] || DEFAULT_PAGE_DELIMITER end # If there's only one size requested, generate the images directly into diff --git a/lib/docsplit/pdf_extractor.rb b/lib/docsplit/pdf_extractor.rb index 12069c7..325e6fb 100644 --- a/lib/docsplit/pdf_extractor.rb +++ b/lib/docsplit/pdf_extractor.rb @@ -16,7 +16,7 @@ def osx? def linux? !!HOST_OS.match(/linux/i) end - + # The first line of the help output holds the name and version number # of the office software to be used for extraction. def version_string @@ -35,10 +35,10 @@ def libre_office? def open_office? !!version_string.match(/^OpenOffice.org/) end - + # A set of default locations to search for office software # These have been extracted from JODConverter. Each listed - # path should contain a directory "program" which in turn + # path should contain a directory "program" which in turn # contains the "soffice" executable. # see: https://github.com/mirkonasato/jodconverter/blob/master/jodconverter-core/src/main/java/org/artofsolving/jodconverter/office/OfficeUtils.java#L63-L91 def office_search_paths @@ -56,6 +56,7 @@ def office_search_paths search_paths = %w( /usr/lib/libreoffice /usr/lib64/libreoffice + /usr/bin/libreoffice /opt/libreoffice /usr/lib/openoffice /usr/lib64/openoffice @@ -65,7 +66,7 @@ def office_search_paths end search_paths end - + # Identify the path to a working office executable. def office_executable paths = office_search_paths @@ -77,7 +78,7 @@ def office_executable raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exists? ENV['OFFICE_PATH'] paths.unshift(ENV['OFFICE_PATH']) end - + # The location of the office executable is OS dependent path_pieces = ["soffice"] if windows? @@ -87,7 +88,7 @@ def office_executable else path_pieces += [["program", "soffice"]] end - + # Search for the first suitable office executable # and short circuit an executable is found. 
paths.each do |path| @@ -103,12 +104,12 @@ def office_executable raise OfficeNotFound, "No office software found" unless @@executable @@executable end - + # Used to specify the office location for JODConverter def office_path File.dirname(File.dirname(office_executable)) end - + # Convert documents to PDF. def extract(docs, opts) out = opts[:output] || '.' @@ -124,7 +125,7 @@ def extract(docs, opts) if libre_office? # Set the LibreOffice user profile, so that parallel uses of cloudcrowd don't trip over each other. ENV['SYSUSERCONFIG']="file://#{File.expand_path(escaped_out)}" - + options = "--headless --invisible --norestore --nolockcheck --convert-to pdf --outdir #{escaped_out} #{escaped_doc}" cmd = "#{office_executable} #{options} 2>&1" result = `#{cmd}`.chomp @@ -143,9 +144,9 @@ def extract(docs, opts) LOGGING = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties" HEADLESS = "-Djava.awt.headless=true" - + private - + # Runs a Java command, with quieted logging, and the classpath set properly. def run_jod(command, pdfs, opts, return_output=false)