Skip to content

Commit e46a9d8

Browse files
committed
Parse XLSX files with unusual structure
Normally you will find `workbook.xml` in the `xl` directory, but recently we got a file where it was located in the root. To find the actual location of the workbook, we must parse the file `_rels/.rels`. A separate rels file, located relative to the workbook, then points to the locations of the sheets, styles and shared strings.
1 parent 2ce2466 commit e46a9d8

File tree

5 files changed

+50
-10
lines changed

5 files changed

+50
-10
lines changed

lib/creek/book.rb

+28-6
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,9 @@ class Creek::Book
88
attr_reader :files,
99
:sheets,
1010
:shared_strings,
11-
:with_headers
11+
:with_headers,
12+
:workbook_rels_by_type,
13+
:workbook_rels_by_id
1214

1315
DATE_1900 = Date.new(1899, 12, 30).freeze
1416
DATE_1904 = Date.new(1904, 1, 1).freeze
@@ -21,12 +23,14 @@ def initialize path, options = {}
2123
end
2224
path = download_file(path) if options[:remote]
2325
@files = Zip::File.open(path)
26+
parse_workbook_path
27+
parse_workbook_rels
2428
@shared_strings = SharedStrings.new(self)
2529
@with_headers = options.fetch(:with_headers, false)
2630
end
2731

2832
def sheets
29-
doc = @files.file.open "xl/workbook.xml"
33+
doc = @files.file.open @workbook_path
3034
xml = Nokogiri::XML::Document.parse doc
3135
namespaces = xml.namespaces
3236

@@ -37,10 +41,8 @@ def sheets
3741
end
3842
end
3943

40-
rels_doc = @files.file.open "xl/_rels/workbook.xml.rels"
41-
rels = Nokogiri::XML::Document.parse(rels_doc).css("Relationship")
4244
@sheets = xml.css(cssPrefix+'sheet').map do |sheet|
43-
sheetfile = rels.find { |el| sheet.attr("r:id") == el.attr("Id") }.attr("Target")
45+
sheetfile = @workbook_rels_by_id[sheet.attr("r:id")]
4446
sheet = Sheet.new(
4547
self,
4648
sheet.attr("name"),
@@ -71,7 +73,7 @@ def base_date
7173
# http://msdn.microsoft.com/en-us/library/ff530155(v=office.12).aspx
7274
result = DATE_1900 # default
7375

74-
doc = @files.file.open "xl/workbook.xml"
76+
doc = @files.file.open @workbook_path
7577
xml = Nokogiri::XML::Document.parse doc
7678
xml.css('workbookPr[date1904]').each do |workbookPr|
7779
if workbookPr['date1904'] =~ /true|1/i
@@ -98,5 +100,25 @@ def download_file(url)
98100
downloaded.path
99101
end
100102
end
103+
104+
def parse_workbook_path
105+
rels_file = @files.file.open '_rels/.rels'
106+
rels_xml = Nokogiri::XML::Document.parse(rels_file).css('Relationship')
107+
rel = rels_xml.find { |el| el.attr('Type') == "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" }
108+
@workbook_path = rel.attr('Target')
109+
end
110+
111+
def parse_workbook_rels
112+
@workbook_rels_by_id = {}
113+
@workbook_rels_by_type = {}
114+
workbook_dirname, slash, workbook_basename = @workbook_path.rpartition('/')
115+
workbook_rels_file = @files.file.open "#{workbook_dirname}#{slash}_rels/#{workbook_basename}.rels"
116+
Nokogiri::XML::Document.parse(workbook_rels_file).css('Relationship').each do |rel|
117+
target = rel.attr('Target')
118+
target = "#{workbook_dirname}#{slash}#{target}" unless target.start_with?('/')
119+
@workbook_rels_by_id[rel.attr('Id')] = target
120+
@workbook_rels_by_type[rel.attr('Type')] = target
121+
end
122+
end
101123
end
102124
end

lib/creek/shared_strings.rb

+2-2
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ def initialize book
1313
end
1414

1515
def parse_shared_shared_strings
16-
path = "xl/sharedStrings.xml"
17-
if @book.files.file.exist?(path)
16+
path = @book.workbook_rels_by_type["http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings"]
17+
if path
1818
doc = @book.files.file.open path
1919
xml = Nokogiri::XML::Document.parse doc
2020
parse_shared_string_from_document(xml)

lib/creek/styles.rb

+2-2
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,12 @@ def initialize(book)
66
end
77

88
def path
9-
"xl/styles.xml"
9+
@book.workbook_rels_by_type["http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles"]
1010
end
1111

1212
def styles_xml
1313
@styles_xml ||= begin
14-
if @book.files.file.exist?(path)
14+
if path
1515
doc = @book.files.file.open path
1616
Nokogiri::XML::Document.parse doc
1717
end

spec/fixtures/test_structure.xlsx

5.12 KB
Binary file not shown.

spec/test_spec.rb

+18
Original file line numberDiff line numberDiff line change
@@ -166,3 +166,21 @@
166166
expect(rows.map{|r| r['cells']}).to eq(@expected_rows)
167167
end
168168
end
169+
170+
# Opens the test_structure.xlsx fixture once for the whole group and checks
# that the rows of its first sheet match the expected cell hashes.
# NOTE(review): presumably this fixture keeps workbook.xml outside the usual
# xl/ directory — confirm against the fixture itself.
describe 'Creek parsing a file with unusual structure.' do
  before(:all) do
    header_row = {"A1"=>"date", "B1"=>"dimension", "C1"=>"metric_1", "D1"=>"metric_2", "E1"=>"metric_3"}
    data_row = {"A2"=>"2022-02-27", "B2"=>"A", "C2"=>"1", "D2"=>"5.30", "E2"=>"11:11:11"}

    @expected_rows = [header_row, data_row]
    @creek = Creek::Book.new('spec/fixtures/test_structure.xlsx')
  end

  # Close the book that before(:all) opened.
  after(:all) { @creek.close }

  it 'Parse rows successfully.' do
    first_sheet = @creek.sheets[0]
    expect(first_sheet.rows.to_a).to eq(@expected_rows)
  end
end

0 commit comments

Comments
 (0)