Support JSON arrays reader/parse for datafusion #19924
Changes from all commits

@@ -47,6 +47,7 @@ mod tests {

```rust
    use datafusion_common::stats::Precision;

    use datafusion_common::Result;
    use datafusion_datasource::file_compression_type::FileCompressionType;
    use futures::StreamExt;
    use insta::assert_snapshot;
    use object_store::local::LocalFileSystem;
```
@@ -391,4 +392,276 @@ mod tests {

```rust
        assert_eq!(metadata.len(), 0);
        Ok(())
    }

    #[tokio::test]
    async fn test_json_array_format() -> Result<()> {
        let session = SessionContext::new();
        let ctx = session.state();
        let store = Arc::new(LocalFileSystem::new()) as _;

        // Create a temporary file with JSON array format
        let tmp_dir = tempfile::TempDir::new()?;
        let path = format!("{}/array.json", tmp_dir.path().to_string_lossy());
        std::fs::write(
            &path,
            r#"[
                {"a": 1, "b": 2.0, "c": true},
                {"a": 2, "b": 3.5, "c": false},
                {"a": 3, "b": 4.0, "c": true}
            ]"#,
        )?;
```
Comment on lines +398 to +413 (Contributor):

> I think this standard preamble could be reduced so there were fewer test lines (and thus it was easier to verify what was being tested). For example, it looks like you maybe could make a function like `let file_schema = create_json_with_format("{..}", format);` — I bet the tests would be less than half the size.
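Not part of the diff — a minimal sketch of the kind of helper the reviewer is suggesting, assuming the test module's existing imports (`SessionState`, `JsonFormat`, `SchemaRef`, `ObjectStore`, `local_unpartitioned_file`); the name `create_json_with_format` and the exact signature are illustrative, not this PR's code:

```rust
/// Hypothetical test helper (reviewer's suggestion, not in this PR):
/// write `json` to a temp file and infer its schema with `format`.
/// The TempDir is returned so the file outlives the call.
async fn create_json_with_format(
    ctx: &SessionState,
    store: &Arc<dyn ObjectStore>,
    json: &str,
    format: &JsonFormat,
) -> Result<(tempfile::TempDir, SchemaRef)> {
    let tmp_dir = tempfile::TempDir::new()?;
    let path = format!("{}/data.json", tmp_dir.path().to_string_lossy());
    std::fs::write(&path, json)?;
    let schema = format
        .infer_schema(ctx, store, &[local_unpartitioned_file(&path)])
        .await?;
    Ok((tmp_dir, schema))
}
```

Each test body would then shrink to one call plus its assertions, which is the size reduction the reviewer has in mind.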
```rust
        // Test with format_array = true
        let format = JsonFormat::default().with_format_array(true);
        let file_schema = format
            .infer_schema(&ctx, &store, &[local_unpartitioned_file(&path)])
            .await
            .expect("Schema inference");

        let fields = file_schema
            .fields()
            .iter()
            .map(|f| format!("{}: {:?}", f.name(), f.data_type()))
            .collect::<Vec<_>>();
        assert_eq!(vec!["a: Int64", "b: Float64", "c: Boolean"], fields);

        Ok(())
    }

    #[tokio::test]
    async fn test_json_array_format_empty() -> Result<()> {
        let session = SessionContext::new();
        let ctx = session.state();
        let store = Arc::new(LocalFileSystem::new()) as _;

        let tmp_dir = tempfile::TempDir::new()?;
        let path = format!("{}/empty_array.json", tmp_dir.path().to_string_lossy());
        std::fs::write(&path, "[]")?;

        let format = JsonFormat::default().with_format_array(true);
        let result = format
            .infer_schema(&ctx, &store, &[local_unpartitioned_file(&path)])
            .await;

        assert!(result.is_err());
        assert!(
            result
                .unwrap_err()
                .to_string()
                .contains("JSON array is empty")
        );

        Ok(())
    }

    #[tokio::test]
    async fn test_json_array_format_with_limit() -> Result<()> {
        let session = SessionContext::new();
        let ctx = session.state();
        let store = Arc::new(LocalFileSystem::new()) as _;

        let tmp_dir = tempfile::TempDir::new()?;
        let path = format!("{}/array_limit.json", tmp_dir.path().to_string_lossy());
        std::fs::write(
            &path,
            r#"[
                {"a": 1},
                {"a": 2, "b": "extra"}
            ]"#,
        )?;

        // Only infer from first record
        let format = JsonFormat::default()
            .with_format_array(true)
            .with_schema_infer_max_rec(1);

        let file_schema = format
            .infer_schema(&ctx, &store, &[local_unpartitioned_file(&path)])
            .await
            .expect("Schema inference");

        // Should only have field "a" since we limited to 1 record
        let fields = file_schema
            .fields()
            .iter()
            .map(|f| format!("{}: {:?}", f.name(), f.data_type()))
            .collect::<Vec<_>>();
        assert_eq!(vec!["a: Int64"], fields);

        Ok(())
    }

    #[tokio::test]
    async fn test_json_array_format_read_data() -> Result<()> {
        let session = SessionContext::new();
        let ctx = session.state();
        let task_ctx = ctx.task_ctx();
        let store = Arc::new(LocalFileSystem::new()) as _;

        // Create a temporary file with JSON array format
        let tmp_dir = tempfile::TempDir::new()?;
        let path = format!("{}/array.json", tmp_dir.path().to_string_lossy());
        std::fs::write(
            &path,
            r#"[
                {"a": 1, "b": 2.0, "c": true},
                {"a": 2, "b": 3.5, "c": false},
                {"a": 3, "b": 4.0, "c": true}
            ]"#,
        )?;

        let format = JsonFormat::default().with_format_array(true);

        // Infer schema
        let file_schema = format
            .infer_schema(&ctx, &store, &[local_unpartitioned_file(&path)])
            .await?;

        // Scan and read data
        let exec = scan_format(
            &ctx,
            &format,
            Some(file_schema),
            tmp_dir.path().to_str().unwrap(),
            "array.json",
            None,
            None,
        )
        .await?;
        let batches = collect(exec, task_ctx).await?;

        assert_eq!(1, batches.len());
        assert_eq!(3, batches[0].num_columns());
        assert_eq!(3, batches[0].num_rows());

        // Verify data
        let array_a = as_int64_array(batches[0].column(0))?;
        assert_eq!(
            vec![1, 2, 3],
            (0..3).map(|i| array_a.value(i)).collect::<Vec<_>>()
        );

        Ok(())
    }

    #[tokio::test]
    async fn test_json_array_format_with_projection() -> Result<()> {
        let session = SessionContext::new();
        let ctx = session.state();
        let task_ctx = ctx.task_ctx();
        let store = Arc::new(LocalFileSystem::new()) as _;

        let tmp_dir = tempfile::TempDir::new()?;
        let path = format!("{}/array.json", tmp_dir.path().to_string_lossy());
        std::fs::write(&path, r#"[{"a": 1, "b": "hello"}, {"a": 2, "b": "world"}]"#)?;

        let format = JsonFormat::default().with_format_array(true);
        let file_schema = format
            .infer_schema(&ctx, &store, &[local_unpartitioned_file(&path)])
            .await?;

        // Project only column "a"
        let exec = scan_format(
            &ctx,
            &format,
            Some(file_schema),
            tmp_dir.path().to_str().unwrap(),
            "array.json",
            Some(vec![0]),
            None,
        )
        .await?;
        let batches = collect(exec, task_ctx).await?;

        assert_eq!(1, batches.len());
        assert_eq!(1, batches[0].num_columns()); // Only 1 column projected
        assert_eq!(2, batches[0].num_rows());

        Ok(())
    }

    #[tokio::test]
    async fn test_ndjson_read_options_format_array() -> Result<()> {
        let ctx = SessionContext::new();

        // Create a temporary file with JSON array format
        let tmp_dir = tempfile::TempDir::new()?;
        let path = format!("{}/array.json", tmp_dir.path().to_string_lossy());
        std::fs::write(
            &path,
            r#"[
                {"a": 1, "b": "hello"},
                {"a": 2, "b": "world"},
                {"a": 3, "b": "test"}
            ]"#,
        )?;

        // Use NdJsonReadOptions with format_array = true
        let options = NdJsonReadOptions::default().format_array(true);

        ctx.register_json("json_array_table", &path, options)
            .await?;

        let result = ctx
            .sql("SELECT a, b FROM json_array_table ORDER BY a")
            .await?
            .collect()
            .await?;

        assert_snapshot!(batches_to_string(&result), @r"
        +---+-------+
        | a | b     |
        +---+-------+
        | 1 | hello |
        | 2 | world |
        | 3 | test  |
        +---+-------+
        ");

        Ok(())
    }

    #[tokio::test]
    async fn test_ndjson_read_options_format_array_with_compression() -> Result<()> {
        use flate2::Compression;
        use flate2::write::GzEncoder;
        use std::io::Write;

        let ctx = SessionContext::new();

        // Create a temporary gzip compressed JSON array file
        let tmp_dir = tempfile::TempDir::new()?;
        let path = format!("{}/array.json.gz", tmp_dir.path().to_string_lossy());

        let json_content = r#"[{"a": 1, "b": "hello"}, {"a": 2, "b": "world"}]"#;
        let file = std::fs::File::create(&path)?;
        let mut encoder = GzEncoder::new(file, Compression::default());
        encoder.write_all(json_content.as_bytes())?;
        encoder.finish()?;

        // Use NdJsonReadOptions with format_array and GZIP compression
        let options = NdJsonReadOptions::default()
            .format_array(true)
            .file_compression_type(FileCompressionType::GZIP)
            .file_extension(".json.gz");

        ctx.register_json("json_array_gzip", &path, options).await?;

        let result = ctx
            .sql("SELECT a, b FROM json_array_gzip ORDER BY a")
            .await?
            .collect()
            .await?;

        assert_snapshot!(batches_to_string(&result), @r"
        +---+-------+
        | a | b     |
        +---+-------+
        | 1 | hello |
        | 2 | world |
        +---+-------+
        ");

        Ok(())
    }
}
```
New file:

@@ -0,0 +1,5 @@

```json
[
{"a": 1, "b": "hello"},
{"a": 2, "b": "world"},
{"a": 3, "b": "test"}
]
```
New file:

@@ -0,0 +1 @@

```json
[]
```
Comment (Contributor):

> I think `format_array` will be hard to discover / find, and we should call this parameter something more standard. I looked at what other systems did and there is no consistency:
>
> - Spark's docs seem to use `multiLine = true` for what you have labelled `format_array`: https://spark.apache.org/docs/latest/sql-data-sources-json.html
> - DuckDB seems to call it `format = newline_delimited`: https://duckdb.org/docs/stable/data/json/loading_json#parameters
> - Postgres seems to have two separate functions, `row_to_json` and `array_to_json`: https://www.postgresql.org/docs/9.5/functions-json.html
>
> Of the options, I think I prefer the DuckDB-style `newline_delimited`, though maybe Spark's `multiLine` would be more widely understood.
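For reference, the distinction all of these spellings name, shown as data (adapted from this PR's own test fixtures): newline-delimited JSON holds one object per line,

```
{"a": 1, "b": "hello"}
{"a": 2, "b": "world"}
```

while the format this PR adds support for holds a single top-level JSON array of objects:

```json
[{"a": 1, "b": "hello"}, {"a": 2, "b": "world"}]
```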
Comment (Contributor):

> IMO it would be better to use an enum here, e.g. `JSON_FORMAT { NDJSON, ARRAY }`. It would be clearer than true/false, and also easier to extend with a third, fourth, ... format later.
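A minimal sketch of the enum-shaped API this comment describes — the names `JsonStructure`, `NewlineDelimited`, and `with_structure` are illustrative only; the PR as written uses `with_format_array(bool)`:

```rust
/// Hypothetical alternative to the boolean `format_array` flag
/// (illustrative names; not this PR's actual API).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum JsonStructure {
    /// One JSON object per line -- today's default behavior.
    #[default]
    NewlineDelimited,
    /// A single top-level JSON array of objects.
    Array,
}

// A builder method taking the enum would replace the bool:
//
//     let format = JsonFormat::default().with_structure(JsonStructure::Array);
//
// A third format later becomes a new variant rather than a second flag.
```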