-
Notifications
You must be signed in to change notification settings - Fork 160
Open
Description
Hello, not sure this is strictly speaking a driver issue but it's where I encounter it. I'm essentially running this query:
install httpfs;
load httpfs;
install spatial;
load spatial;
set s3_access_key_id = '';
set s3_secret_access_key = '';
set s3_region='us-west-2';
create table place as select * from read_parquet('s3://overturemaps-us-west-2/release/2025-03-19.0/theme=places/type=place/*') where bbox.xmin > -97.734375 and bbox.xmax < -97.3828125 and bbox.ymin > 30.234375 and bbox.ymax < 30.41015625;
nolan@flynode ~/P/Waynav (main) [1]> time duckdb -s "install httpfs; load httpfs
; install spatial; load spatial; set s3_access_key_id = ''; set s3_secret_access
_key = ''; set s3_region='us-west-2'; create table place as select * from read_p
arquet('s3://overturemaps-us-west-2/release/2025-03-19.0/theme=places/type=place
/*') where bbox.xmin > -97.734375 and bbox.xmax < -97.3828125 and bbox.ymin > 30
.234375 and bbox.ymax < 30.41015625;"
100% ▕60 █ characters▏
56 _ characters
Executed in 18.94 secs fish external
usr time 13.93 secs 2.11 millis 13.93 secs
sys time 0.36 secs 0.09 millis 0.36 secs
Now I want to run that same set of queries via Rust, so I wrote this:
use duckdb::{params, Connection};
const ROOT: &str = "s3://overturemaps-us-west-2/release/2025-03-19.0";
fn main() -> Result<(), Box<dyn std::error::Error>> {
let conn = Connection::open_in_memory()?;
let _ = conn.execute_batch(
r#"install httpfs;
load httpfs;
install spatial;
load spatial;
set s3_access_key_id = '';
set s3_secret_access_key = '';
set s3_region='us-west-2';
"#,
)?;
let table = "place";
let theme = "theme=places/type=place";
let xmin = -97.734375;
let xmax = -97.3828125;
let ymin = 30.234375;
let ymax = 30.41015625;
let root = format!("{ROOT}/{theme}/*");
println!("Making table");
println!(r#"create table {table} as select * from read_parquet("{root}") where bbox.xmin > {xmin} and bbox.xmax < {xmax} and bbox.ymin > {ymin} and bbox.ymax < {ymax};"#);
let query = format!(r#"create table {table} as select * from read_parquet(?) where bbox.xmin > ? and bbox.xmax < ? and bbox.ymin > ? and bbox.ymax < ?;"#);
conn.execute(&query, params![root, xmin, xmax, ymin, ymax,])?;
println!("Counting");
conn.query_row("select count() from place;", [], |row| {
let count: usize = row.get(0).unwrap();
println!("{count}");
Ok(())
})?;
Ok(())
}
But it runs far slower:
nolan@flynode ~/P/duckdb_test> time cargo run --release
Finished `release` profile [optimized] target(s) in 0.22s
Running `target/release/duckdb_test`
Making table
create table place as select * from read_parquet("s3://overturemaps-us-west-2/re
lease/2025-03-19.0/theme=places/type=place/*") where bbox.xmin > -97.734375 and
bbox.xmax < -97.3828125 and bbox.ymin > 30.234375 and bbox.ymax < 30.41015625;
Counting
16119
56 _ characters
Executed in 547.47 secs fish external
usr time 109.85 secs 253.00 micros 109.85 secs
sys time 24.08 secs 278.00 micros 24.08 secs
I know the Rust code does a bit more with the count, but even so it takes a long time to get there.
Is there some tuning the CLI does that makes this workload run orders of magnitude faster there? I wouldn't sweat if it was a small difference but this is huge enough to make the project I'm working on non-viable.
Thanks for any help.
pmetras
Metadata
Metadata
Assignees
Labels
No labels