@@ -102,17 +102,26 @@ def run(command)
102
102
result
103
103
end
104
104
105
# Run the `pdftotext` command for +pdf+, writing its output to +text_path+.
#
# pdf       - path of the source PDF; passed through ESCAPE before use.
# text_path - path of the text file to write; passed through ESCAPE.
# options   - optional extra command-line flags (an Array of Strings, a
#             single String, or nil). They are placed before the built-in
#             flags. The argument itself is never modified.
#
# `-enc UTF-8` is always added, and `-layout` is added when @keep_layout
# is truthy. Returns whatever `run` returns for the assembled command.
def run_pdftotext(pdf, text_path, options = [])
  # Splat into a fresh array: the caller's list (which may be frozen or
  # reused between calls) must not accumulate our flags.
  flags = [*options, '-enc UTF-8']
  flags << '-layout' if @keep_layout

  # stderr is redirected into stdout (2>&1) as part of the command line.
  run "pdftotext #{flags.join(' ')} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
end
112
+
105
113
# Extract the full contents of a pdf as a single file, directly.
#
# The output path is built by joining @output with a file name derived
# from @pdf_name and a `.txt` suffix. This span shows both sides of a
# change: the removed (`-`) line invoked pdftotext inline through `run`,
# while the added (`+`) line passes the pdf and the output path to
# `run_pdftotext` instead.
106
114
def extract_full ( pdf )
107
115
text_path = File . join ( @output , "#{ @pdf_name } .txt" )
108
- run "pdftotext -enc UTF-8 #{ ESCAPE [ pdf ] } #{ ESCAPE [ text_path ] } 2>&1"
116
+ run_pdftotext pdf , text_path
109
117
end
110
118
111
119
# Extract the contents of a single page of text, directly, adding it to
112
120
# the `@pages_to_ocr` list if the text length is inadequate.
113
121
def extract_page ( pdf , page )
114
122
text_path = File . join ( @output , "#{ @pdf_name } _#{ page } .txt" )
115
- run "pdftotext -enc UTF-8 -f #{ page } -l #{ page } #{ ESCAPE [ pdf ] } #{ ESCAPE [ text_path ] } 2>&1"
123
+ run_pdftotext pdf , text_path , [ "-f #{ page } " , "-l #{ page } " ]
124
+
116
125
unless @forbid_ocr
117
126
@pages_to_ocr . push ( page ) if File . read ( text_path ) . length < MIN_TEXT_PER_PAGE
118
127
end
@@ -126,6 +135,7 @@ def extract_options(options)
126
135
@language = options [ :language ] || 'eng'
127
136
@clean_ocr = ( !( options [ :clean ] == false ) and @language == 'eng' )
128
137
@detect_orientation = ( ( options [ :detect_orientation ] != false ) and DEPENDENCIES [ :osd ] )
138
+ @keep_layout = options . fetch ( :layout , false )
129
139
end
130
140
131
141
end
0 commit comments