Skip to content

Commit 4c5ba50

Browse files
authored
Merge pull request #132 from scarfacedeb/layout_option
Add layout option to keep layout during text extraction
2 parents 2a63a81 + c42ce95 commit 4c5ba50

File tree

1 file changed

+12
-2
lines changed

1 file changed

+12
-2
lines changed

lib/docsplit/text_extractor.rb

+12-2
Original file line numberDiff line numberDiff line change
@@ -102,17 +102,26 @@ def run(command)
102102
result
103103
end
104104

105+
# Run the `pdftotext` command, extracting text from `pdf` into `text_path`.
#
# pdf       - path of the source PDF; passed through ESCAPE before use.
# text_path - path of the text file to write; passed through ESCAPE before use.
# options   - extra command-line flags (e.g. "-f 3"), emitted ahead of the
#             flags added here. The caller's array is never modified.
#
# `-enc UTF-8` is always added; `-layout` is added when @keep_layout is
# truthy. stderr is redirected into stdout (`2>&1`). Returns whatever `run`
# returns for the assembled command line.
def run_pdftotext(pdf, text_path, options=[])
  # Build a fresh array rather than appending with `<<`, so a caller-supplied
  # (possibly shared or frozen) options array is left untouched.
  flags = [*options, '-enc UTF-8']
  flags << '-layout' if @keep_layout

  run "pdftotext #{flags.join(' ')} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
end
112+
105113
# Extract the full contents of a pdf as a single file, directly.
# The destination is "<@pdf_name>.txt" inside the @output directory, and
# pdftotext is asked to write the text of `pdf` to that path.
106114
def extract_full(pdf)
107115
text_path = File.join(@output, "#{@pdf_name}.txt")
108-
run "pdftotext -enc UTF-8 #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
116+
run_pdftotext pdf, text_path
109117
end
110118

111119
# Extract the contents of a single page of text, directly, adding it to
112120
# the `@pages_to_ocr` list if the text length is inadequate.
113121
def extract_page(pdf, page)
114122
text_path = File.join(@output, "#{@pdf_name}_#{page}.txt")
115-
run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
123+
run_pdftotext pdf, text_path, ["-f #{page}", "-l #{page}"]
124+
116125
unless @forbid_ocr
117126
@pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE
118127
end
@@ -126,6 +135,7 @@ def extract_options(options)
126135
@language = options[:language] || 'eng'
127136
@clean_ocr = (!(options[:clean] == false) and @language == 'eng')
128137
@detect_orientation = ((options[:detect_orientation] != false) and DEPENDENCIES[:osd])
138+
@keep_layout = options.fetch(:layout, false)
129139
end
130140

131141
end

0 commit comments

Comments
 (0)