Skip to content

Commit 99ebe62

Browse files
Fix rewind bug
1 parent 34ccba4 commit 99ebe62

File tree

2 files changed

+81
-1
lines changed

2 files changed

+81
-1
lines changed

datafusion/datasource-arrow/src/file_format.rs

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ use std::any::Any;
2323
use std::borrow::Cow;
2424
use std::collections::HashMap;
2525
use std::fmt::{self, Debug};
26+
use std::io::{Seek, SeekFrom};
2627
use std::sync::Arc;
2728

2829
use arrow::datatypes::{Schema, SchemaRef};
@@ -152,7 +153,13 @@ impl FileFormat for ArrowFormat {
152153
GetResultPayload::File(mut file, _) => {
153154
match FileReader::try_new(&mut file, None) {
154155
Ok(reader) => reader.schema(),
155-
Err(_) => StreamReader::try_new(&mut file, None)?.schema(),
156+
Err(_) => {
157+
// not in the file format, but FileReader read some bytes
158+
// while trying to parse the file and so we need to rewind
159+
// it to the beginning of the file
160+
file.seek(SeekFrom::Start(0))?;
161+
StreamReader::try_new(&mut file, None)?.schema()
162+
}
156163
}
157164
}
158165
GetResultPayload::Stream(stream) => infer_ipc_schema(stream).await?,

datafusion/sqllogictest/test_files/arrow_files.slt

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,3 +128,76 @@ physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/
128128
# Errors in partition filters should be reported
129129
query error Divide by zero error
130130
SELECT f0 FROM arrow_partitioned WHERE CASE WHEN true THEN 1 / 0 ELSE part END = 1;
131+
132+
#############
133+
## Arrow IPC stream format support
134+
#############
135+
136+
# Test CREATE EXTERNAL TABLE with stream format
137+
statement ok
138+
CREATE EXTERNAL TABLE arrow_stream
139+
STORED AS ARROW
140+
LOCATION '../datasource-arrow/tests/data/example_stream.arrow';
141+
142+
# physical plan for stream format
143+
query TT
144+
EXPLAIN SELECT * FROM arrow_stream
145+
----
146+
logical_plan TableScan: arrow_stream projection=[f0, f1, f2]
147+
physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/datasource-arrow/tests/data/example_stream.arrow]]}, projection=[f0, f1, f2], file_type=arrow_stream
148+
149+
# stream format should return same data as file format
150+
query ITB
151+
SELECT * FROM arrow_stream
152+
----
153+
1 foo true
154+
2 bar NULL
155+
3 baz false
156+
4 NULL true
157+
158+
# Verify both file and stream formats return identical results
159+
query ITB
160+
SELECT * FROM arrow_simple ORDER BY f0
161+
----
162+
1 foo true
163+
2 bar NULL
164+
3 baz false
165+
4 NULL true
166+
167+
query ITB
168+
SELECT * FROM arrow_stream ORDER BY f0
169+
----
170+
1 foo true
171+
2 bar NULL
172+
3 baz false
173+
4 NULL true
174+
175+
# Both formats should support projection pushdown
176+
query IT
177+
SELECT f0, f1 FROM arrow_simple ORDER BY f0
178+
----
179+
1 foo
180+
2 bar
181+
3 baz
182+
4 NULL
183+
184+
query IT
185+
SELECT f0, f1 FROM arrow_stream ORDER BY f0
186+
----
187+
1 foo
188+
2 bar
189+
3 baz
190+
4 NULL
191+
192+
# Both formats should support filtering
193+
query ITB
194+
SELECT * FROM arrow_simple WHERE f0 > 2 ORDER BY f0
195+
----
196+
3 baz false
197+
4 NULL true
198+
199+
query ITB
200+
SELECT * FROM arrow_stream WHERE f0 > 2 ORDER BY f0
201+
----
202+
3 baz false
203+
4 NULL true

0 commit comments

Comments
 (0)