diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 5241cad3f..ec2fd0d91 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -7,6 +7,19 @@ Thanks for sending a pull request! Here are some tips for you: 5. Be sure to keep the PR description updated to reflect all changes. --> + + ## What changes are proposed in this pull request? 🏗️ Breaking changes" }, + { message = "^feat", group = "🚀 Features / new APIs" }, + { message = "^fix", group = "🐛 Bug Fixes" }, + { message = "^doc", group = "📚 Documentation" }, + { message = "^perf", group = "⚡ Performance" }, + { message = "^refactor", group = "🚜 Refactor" }, + { message = "^test", group = "🧪 Testing" }, + { message = "^chore|^ci", group = "⚙️ Chores/CI" }, + { message = "^revert", group = "◀️ Revert" }, + { message = ".*", group = "Other" }, +] +# filter out the commits that are not matched by commit parsers +filter_commits = false +# sort the tags topologically +topo_order = false +# sort the commits inside sections by oldest/newest order +sort_commits = "oldest" diff --git a/feature-tests/Cargo.toml b/feature-tests/Cargo.toml index 6f9827e83..7e45e41e2 100644 --- a/feature-tests/Cargo.toml +++ b/feature-tests/Cargo.toml @@ -8,6 +8,9 @@ repository.workspace = true readme.workspace = true version.workspace = true +[package.metadata.release] +release = false + [dependencies] delta_kernel = { path = "../kernel" } diff --git a/ffi-proc-macros/Cargo.toml b/ffi-proc-macros/Cargo.toml index ce8f1936b..326462247 100644 --- a/ffi-proc-macros/Cargo.toml +++ b/ffi-proc-macros/Cargo.toml @@ -10,6 +10,9 @@ readme.workspace = true rust-version.workspace = true version.workspace = true +[package.metadata.release] +release = false + [lib] proc-macro = true diff --git a/ffi/Cargo.toml b/ffi/Cargo.toml index 08162a505..aa4edc167 100644 --- a/ffi/Cargo.toml +++ b/ffi/Cargo.toml @@ -10,6 +10,9 @@ version.workspace = true rust-version.workspace = true build = "build.rs" +[package.metadata.release] +release = false + [lib] crate-type = ["lib", "cdylib", "staticlib"] @@ -21,16 +24,16 @@ url = "2" delta_kernel = { path = "../kernel", default-features = false, features = [ "developer-visibility", ] } -delta_kernel_ffi_macros = { path = "../ffi-proc-macros", version = "0.6.0" } +delta_kernel_ffi_macros = { path = "../ffi-proc-macros", version = "0.6.1" } # used if we use the default engine to be able to move arrow data into the c-ffi format -arrow-schema = { version = "53.0", default-features = false, features = [ +arrow-schema = { version = ">=53, <55", default-features = false, features = [ "ffi", ], optional = true } -arrow-data = { version = "53.0", default-features = false, features = [ +arrow-data = { version = ">=53, <55", default-features = false, features = [ "ffi", ], optional = true } -arrow-array = { version = "53.0", default-features = false, optional = true } +arrow-array = { version = ">=53, <55", default-features = false, optional = true } [build-dependencies] cbindgen = "0.27.0" diff --git a/ffi/examples/read-table/arrow.c b/ffi/examples/read-table/arrow.c index d58a2fa2d..7eb32b7c3 100644 --- a/ffi/examples/read-table/arrow.c +++ b/ffi/examples/read-table/arrow.c @@ -97,7 +97,7 @@ static GArrowRecordBatch* add_partition_columns( } GArrowArray* partition_col = garrow_array_builder_finish((GArrowArrayBuilder*)builder, &error); - if (report_g_error("Can't build string array for parition column", error)) { + if (report_g_error("Can't build string array 
for partition column", error)) { printf("Giving up on column %s\n", col); g_error_free(error); g_object_unref(builder); @@ -144,7 +144,7 @@ static void add_batch_to_context( } record_batch = add_partition_columns(record_batch, partition_cols, partition_values); if (record_batch == NULL) { - printf("Failed to add parition columns, not adding batch\n"); + printf("Failed to add partition columns, not adding batch\n"); return; } context->batches = g_list_append(context->batches, record_batch); diff --git a/ffi/examples/read-table/read_table.c b/ffi/examples/read-table/read_table.c index 0aa8caa41..7b1a7f2c7 100644 --- a/ffi/examples/read-table/read_table.c +++ b/ffi/examples/read-table/read_table.c @@ -43,7 +43,7 @@ void print_partition_info(struct EngineContext* context, const CStringMap* parti } // Kernel will call this function for each file that should be scanned. The arguments include enough -// context to constuct the correct logical data from the physically read parquet +// context to construct the correct logical data from the physically read parquet void scan_row_callback( void* engine_context, KernelStringSlice path, diff --git a/ffi/src/engine_funcs.rs b/ffi/src/engine_funcs.rs index f8534dfc0..1afb60510 100644 --- a/ffi/src/engine_funcs.rs +++ b/ffi/src/engine_funcs.rs @@ -42,7 +42,7 @@ impl Drop for FileReadResultIterator { } } -/// Call the engine back with the next `EngingeData` batch read by Parquet/Json handler. The +/// Call the engine back with the next `EngineData` batch read by Parquet/Json handler. The /// _engine_ "owns" the data that is passed into the `engine_visitor`, since it is allocated by the /// `Engine` being used for log-replay. If the engine wants the kernel to free this data, it _must_ /// call [`free_engine_data`] on it. diff --git a/ffi/src/expressions/kernel.rs b/ffi/src/expressions/kernel.rs index f2ed8b1a3..a5116db47 100644 --- a/ffi/src/expressions/kernel.rs +++ b/ffi/src/expressions/kernel.rs @@ -83,7 +83,7 @@ pub struct EngineExpressionVisitor { /// Visit a 64bit timestamp belonging to the list identified by `sibling_list_id`. /// The timestamp is microsecond precision with no timezone. pub visit_literal_timestamp_ntz: VisitLiteralFn, - /// Visit a 32bit intger `date` representing days since UNIX epoch 1970-01-01. The `date` belongs + /// Visit a 32bit integer `date` representing days since UNIX epoch 1970-01-01. The `date` belongs /// to the list identified by `sibling_list_id`. pub visit_literal_date: VisitLiteralFn, /// Visit binary data at the `buffer` with length `len` belonging to the list identified by diff --git a/ffi/src/handle.rs b/ffi/src/handle.rs index 27b35bea5..30b695ecc 100644 --- a/ffi/src/handle.rs +++ b/ffi/src/handle.rs @@ -2,8 +2,8 @@ //! boundary. //! //! Creating a [`Handle`] always implies some kind of ownership transfer. A mutable handle takes -//! ownership of the object itself (analagous to [`Box`]), while a non-mutable (shared) handle -//! takes ownership of a shared reference to the object (analagous to [`std::sync::Arc`]). Thus, a created +//! ownership of the object itself (analogous to [`Box`]), while a non-mutable (shared) handle +//! takes ownership of a shared reference to the object (analogous to [`std::sync::Arc`]). Thus, a created //! handle remains [valid][Handle#Validity], and its underlying object remains accessible, until the //! handle is explicitly dropped or consumed. Dropping a mutable handle always drops the underlying //! 
object as well; dropping a shared handle only drops the underlying object if the handle was the diff --git a/ffi/src/scan.rs b/ffi/src/scan.rs index d5695c130..86f5e7e5f 100644 --- a/ffi/src/scan.rs +++ b/ffi/src/scan.rs @@ -383,7 +383,7 @@ struct ContextWrapper { /// data which provides the data handle and selection vector as each element in the iterator. /// /// # Safety -/// engine is responsbile for passing a valid [`ExclusiveEngineData`] and selection vector. +/// engine is responsible for passing a valid [`ExclusiveEngineData`] and selection vector. #[no_mangle] pub unsafe extern "C" fn visit_scan_data( data: Handle, diff --git a/ffi/src/test_ffi.rs b/ffi/src/test_ffi.rs index 27c7063fa..14eec1b86 100644 --- a/ffi/src/test_ffi.rs +++ b/ffi/src/test_ffi.rs @@ -12,7 +12,7 @@ use delta_kernel::{ /// output expression can be found in `ffi/tests/test_expression_visitor/expected.txt`. /// /// # Safety -/// The caller is responsible for freeing the retured memory, either by calling +/// The caller is responsible for freeing the returned memory, either by calling /// [`free_kernel_predicate`], or [`Handle::drop_handle`] #[no_mangle] pub unsafe extern "C" fn get_testing_kernel_expression() -> Handle { diff --git a/integration-tests/src/main.rs b/integration-tests/src/main.rs index 63adb3940..a5bfe0952 100644 --- a/integration-tests/src/main.rs +++ b/integration-tests/src/main.rs @@ -15,8 +15,8 @@ fn create_kernel_schema() -> delta_kernel::schema::Schema { fn main() { let arrow_schema = create_arrow_schema(); let kernel_schema = create_kernel_schema(); - let convereted: delta_kernel::schema::Schema = + let converted: delta_kernel::schema::Schema = delta_kernel::schema::Schema::try_from(&arrow_schema).expect("couldn't convert"); - assert!(kernel_schema == convereted); + assert!(kernel_schema == converted); println!("Okay, made it"); } diff --git a/kernel/Cargo.toml b/kernel/Cargo.toml index 459c49177..a045153cf 100644 --- a/kernel/Cargo.toml +++ b/kernel/Cargo.toml @@ -15,6 +15,13 @@ rust-version.workspace = true [package.metadata.docs.rs] all-features = true +[package.metadata.release] +pre-release-replacements = [ + {file="../README.md", search="delta_kernel = \"[a-z0-9\\.-]+\"", replace="delta_kernel = \"{{version}}\""}, + {file="../README.md", search="version = \"[a-z0-9\\.-]+\"", replace="version = \"{{version}}\""}, +] +pre-release-hook = ["git", "cliff", "--repository", "../", "--config", "../cliff.toml", "--unreleased", "--prepend", "../CHANGELOG.md", "--tag", "{{version}}" ] + [dependencies] bytes = "1.7" chrono = { version = "0.4" } @@ -32,7 +39,7 @@ uuid = "1.10.0" z85 = "3.0.5" # bring in our derive macros -delta_kernel_derive = { path = "../derive-macros", version = "0.6.0" } +delta_kernel_derive = { path = "../derive-macros", version = "0.6.1" } # used for developer-visibility visibility = "0.1.1" diff --git a/kernel/examples/inspect-table/Cargo.toml b/kernel/examples/inspect-table/Cargo.toml index 174f84100..b81a8ac5b 100644 --- a/kernel/examples/inspect-table/Cargo.toml +++ b/kernel/examples/inspect-table/Cargo.toml @@ -15,3 +15,6 @@ delta_kernel = { path = "../../../kernel", features = [ ] } env_logger = "0.11.3" url = "2" + +[package.metadata.release] +release = false diff --git a/kernel/examples/inspect-table/src/main.rs b/kernel/examples/inspect-table/src/main.rs index ea25a8404..194530004 100644 --- a/kernel/examples/inspect-table/src/main.rs +++ b/kernel/examples/inspect-table/src/main.rs @@ -184,7 +184,7 @@ fn print_scan_file( fn try_main() -> DeltaResult<()> { let cli = 
Cli::parse(); - // build a table and get the lastest snapshot from it + // build a table and get the latest snapshot from it let table = Table::try_from_uri(&cli.path)?; let engine = DefaultEngine::try_new( diff --git a/kernel/examples/read-table-changes/Cargo.toml b/kernel/examples/read-table-changes/Cargo.toml index f9f980dc2..181da7dc6 100644 --- a/kernel/examples/read-table-changes/Cargo.toml +++ b/kernel/examples/read-table-changes/Cargo.toml @@ -4,6 +4,9 @@ version = "0.1.0" edition = "2021" publish = false +[package.metadata.release] +release = false + [dependencies] arrow-array = { workspace = true } arrow-schema = { workspace = true } diff --git a/kernel/examples/read-table-multi-threaded/Cargo.toml b/kernel/examples/read-table-multi-threaded/Cargo.toml index 178435a38..3362e579a 100644 --- a/kernel/examples/read-table-multi-threaded/Cargo.toml +++ b/kernel/examples/read-table-multi-threaded/Cargo.toml @@ -17,3 +17,6 @@ env_logger = "0.11.5" itertools = "0.13" spmc = "0.3.0" url = "2" + +[package.metadata.release] +release = false diff --git a/kernel/examples/read-table-multi-threaded/src/main.rs b/kernel/examples/read-table-multi-threaded/src/main.rs index d97b6c2d3..e689a4ef4 100644 --- a/kernel/examples/read-table-multi-threaded/src/main.rs +++ b/kernel/examples/read-table-multi-threaded/src/main.rs @@ -104,7 +104,7 @@ fn truncate_batch(batch: RecordBatch, rows: usize) -> RecordBatch { RecordBatch::try_new(batch.schema(), cols).unwrap() } -// This is the callback that will be called fo each valid scan row +// This is the callback that will be called for each valid scan row fn send_scan_file( scan_tx: &mut spmc::Sender, path: &str, @@ -125,7 +125,7 @@ fn send_scan_file( fn try_main() -> DeltaResult<()> { let cli = Cli::parse(); - // build a table and get the lastest snapshot from it + // build a table and get the latest snapshot from it let table = Table::try_from_uri(&cli.path)?; println!("Reading {}", table.location()); @@ -279,7 +279,7 @@ fn do_work( // this example uses the parquet_handler from the engine, but an engine could // choose to use whatever method it might want to read a parquet file. The reader - // could, for example, fill in the parition columns, or apply deletion vectors. Here + // could, for example, fill in the partition columns, or apply deletion vectors. Here // we assume a more naive parquet reader and fix the data up after the fact. // further parallelism would also be possible here as we could read the parquet file // in chunks where each thread reads one chunk. 
The engine would need to ensure diff --git a/kernel/examples/read-table-single-threaded/Cargo.toml b/kernel/examples/read-table-single-threaded/Cargo.toml index 6e0dc147a..dc0458139 100644 --- a/kernel/examples/read-table-single-threaded/Cargo.toml +++ b/kernel/examples/read-table-single-threaded/Cargo.toml @@ -16,3 +16,6 @@ delta_kernel = { path = "../../../kernel", features = [ env_logger = "0.11.5" itertools = "0.13" url = "2" + +[package.metadata.release] +release = false diff --git a/kernel/examples/read-table-single-threaded/src/main.rs b/kernel/examples/read-table-single-threaded/src/main.rs index 32ad3173d..9bbc9476d 100644 --- a/kernel/examples/read-table-single-threaded/src/main.rs +++ b/kernel/examples/read-table-single-threaded/src/main.rs @@ -69,7 +69,7 @@ fn main() -> ExitCode { fn try_main() -> DeltaResult<()> { let cli = Cli::parse(); - // build a table and get the lastest snapshot from it + // build a table and get the latest snapshot from it let table = Table::try_from_uri(&cli.path)?; println!("Reading {}", table.location()); diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 7d4be1a82..0cd12ce50 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -367,7 +367,7 @@ impl RowVisitor for CdcVisitor { pub type SetTransactionMap = HashMap; -/// Extact application transaction actions from the log into a map +/// Extract application transaction actions from the log into a map /// /// This visitor maintains the first entry for each application id it /// encounters. When a specific application id is required then diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 4700b72c0..a3e184574 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -55,9 +55,9 @@ macro_rules! prim_array_cmp { pub(crate) use prim_array_cmp; -/// Get the indicies in `parquet_schema` of the specified columns in `requested_schema`. This -/// returns a tuples of (mask_indicies: Vec, reorder_indicies: -/// Vec). `mask_indicies` is used for generating the mask for reading from the +/// Get the indices in `parquet_schema` of the specified columns in `requested_schema`. This +/// returns a tuples of (mask_indices: Vec, reorder_indices: +/// Vec). 
`mask_indices` is used for generating the mask for reading from the pub(crate) fn make_arrow_error(s: impl Into) -> Error { Error::Arrow(arrow_schema::ArrowError::InvalidArgumentError(s.into())).with_backtrace() } diff --git a/kernel/src/engine/default/json.rs b/kernel/src/engine/default/json.rs index 1912a7b34..ab296e12a 100644 --- a/kernel/src/engine/default/json.rs +++ b/kernel/src/engine/default/json.rs @@ -29,7 +29,7 @@ pub struct DefaultJsonHandler { store: Arc, /// The executor to run async tasks on task_executor: Arc, - /// The maximun number of batches to read ahead + /// The maximum number of batches to read ahead readahead: usize, /// The number of rows to read per batch batch_size: usize, diff --git a/kernel/src/engine/default/parquet.rs b/kernel/src/engine/default/parquet.rs index 1acc4ef4a..a65d329a2 100644 --- a/kernel/src/engine/default/parquet.rs +++ b/kernel/src/engine/default/parquet.rs @@ -258,7 +258,7 @@ impl FileOpener for ParquetOpener { let mut reader = ParquetObjectReader::new(store, meta); let metadata = ArrowReaderMetadata::load_async(&mut reader, Default::default()).await?; let parquet_schema = metadata.schema(); - let (indicies, requested_ordering) = + let (indices, requested_ordering) = get_requested_indices(&table_schema, parquet_schema)?; let options = ArrowReaderOptions::new(); //.with_page_index(enable_page_index); let mut builder = @@ -267,7 +267,7 @@ impl FileOpener for ParquetOpener { &table_schema, parquet_schema, builder.parquet_schema(), - &indicies, + &indices, ) { builder = builder.with_projection(mask) } @@ -330,7 +330,7 @@ impl FileOpener for PresignedUrlOpener { let reader = client.get(file_meta.location).send().await?.bytes().await?; let metadata = ArrowReaderMetadata::load(&reader, Default::default())?; let parquet_schema = metadata.schema(); - let (indicies, requested_ordering) = + let (indices, requested_ordering) = get_requested_indices(&table_schema, parquet_schema)?; let options = ArrowReaderOptions::new(); @@ -340,7 +340,7 @@ impl FileOpener for PresignedUrlOpener { &table_schema, parquet_schema, builder.parquet_schema(), - &indicies, + &indices, ) { builder = builder.with_projection(mask) } diff --git a/kernel/src/engine/sync/parquet.rs b/kernel/src/engine/sync/parquet.rs index 2a54e2e86..260ef321b 100644 --- a/kernel/src/engine/sync/parquet.rs +++ b/kernel/src/engine/sync/parquet.rs @@ -21,9 +21,8 @@ fn try_create_from_parquet( let metadata = ArrowReaderMetadata::load(&file, Default::default())?; let parquet_schema = metadata.schema(); let mut builder = ParquetRecordBatchReaderBuilder::try_new(file)?; - let (indicies, requested_ordering) = get_requested_indices(&schema, parquet_schema)?; - if let Some(mask) = generate_mask(&schema, parquet_schema, builder.parquet_schema(), &indicies) - { + let (indices, requested_ordering) = get_requested_indices(&schema, parquet_schema)?; + if let Some(mask) = generate_mask(&schema, parquet_schema, builder.parquet_schema(), &indices) { builder = builder.with_projection(mask); } if let Some(predicate) = predicate { diff --git a/kernel/src/engine_data.rs b/kernel/src/engine_data.rs index e421d0ad6..25a7e84bd 100644 --- a/kernel/src/engine_data.rs +++ b/kernel/src/engine_data.rs @@ -199,7 +199,7 @@ pub trait RowVisitor { /// "getter" of type [`GetData`] will be present. This can be used to actually get at the data /// for each row. You can `use` the `TypedGetData` trait if you want to have a way to extract /// typed data that will fail if the "getter" is for an unexpected type. 
The data in `getters` - /// does not outlive the call to this funtion (i.e. it should be copied if needed). + /// does not outlive the call to this function (i.e. it should be copied if needed). fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()>; /// Visit the rows of an [`EngineData`], selecting the leaf column names given by diff --git a/kernel/src/error.rs b/kernel/src/error.rs index e3230aeb9..815ef3e51 100644 --- a/kernel/src/error.rs +++ b/kernel/src/error.rs @@ -1,4 +1,4 @@ -//! Defintions of errors that the delta kernel can encounter +//! Definitions of errors that the delta kernel can encounter use std::{ backtrace::{Backtrace, BacktraceStatus}, @@ -58,7 +58,7 @@ pub enum Error { #[error("Internal error {0}. This is a kernel bug, please report.")] InternalError(String), - /// An error enountered while working with parquet data + /// An error encountered while working with parquet data #[cfg(feature = "parquet")] #[error("Arrow error: {0}")] Parquet(#[from] parquet::errors::ParquetError), @@ -99,7 +99,7 @@ pub enum Error { #[error("No table version found.")] MissingVersion, - /// An error occured while working with deletion vectors + /// An error occurred while working with deletion vectors #[error("Deletion Vector error: {0}")] DeletionVector(String), diff --git a/kernel/src/expressions/mod.rs b/kernel/src/expressions/mod.rs index bad20aea4..620142679 100644 --- a/kernel/src/expressions/mod.rs +++ b/kernel/src/expressions/mod.rs @@ -737,7 +737,7 @@ mod tests { ), ]); - // Similer to ExpressionDepthChecker::check, but also returns call count + // Similar to ExpressionDepthChecker::check, but also returns call count let check_with_call_count = |depth_limit| ExpressionDepthChecker::check_with_call_count(&expr, depth_limit); diff --git a/kernel/src/expressions/scalars.rs b/kernel/src/expressions/scalars.rs index 5283c08c5..2ce2fd41a 100644 --- a/kernel/src/expressions/scalars.rs +++ b/kernel/src/expressions/scalars.rs @@ -393,7 +393,7 @@ impl PrimitiveType { // Timestamps may additionally be encoded as a ISO 8601 formatted string such as // `1970-01-01T00:00:00.123456Z`. // - // The difference arrises mostly in how they are to be handled on the engine side - i.e. timestampNTZ + // The difference arises mostly in how they are to be handled on the engine side - i.e. timestampNTZ // is not adjusted to UTC, this is just so we can (de-)serialize it as a date sting. // https://github.com/delta-io/delta/blob/master/PROTOCOL.md#partition-value-serialization TimestampNtz | Timestamp => { diff --git a/kernel/src/lib.rs b/kernel/src/lib.rs index f27907bcd..fa88e7afa 100644 --- a/kernel/src/lib.rs +++ b/kernel/src/lib.rs @@ -202,7 +202,7 @@ impl FileMeta { /// let b: Arc = a.downcast().unwrap(); /// ``` /// -/// In contrast, very similer code that relies only on `Any` would fail to compile: +/// In contrast, very similar code that relies only on `Any` would fail to compile: /// /// ```fail_compile /// # use std::any::Any; @@ -404,7 +404,7 @@ pub trait JsonHandler: AsAny { /// /// - `path` - URL specifying the location to write the JSON file /// - `data` - Iterator of EngineData to write to the JSON file. Each row should be written as - /// a new JSON object appended to the file. (that is, the file is newline-delimeted JSON, and + /// a new JSON object appended to the file. (that is, the file is newline-delimited JSON, and /// each row is a JSON object on a single line) /// - `overwrite` - If true, overwrite the file if it exists. 
If false, the call must fail if /// the file exists. diff --git a/kernel/src/predicates/parquet_stats_skipping/tests.rs b/kernel/src/predicates/parquet_stats_skipping/tests.rs index b1de88e6b..50833a166 100644 --- a/kernel/src/predicates/parquet_stats_skipping/tests.rs +++ b/kernel/src/predicates/parquet_stats_skipping/tests.rs @@ -299,7 +299,7 @@ fn test_sql_where() { "WHERE {TRUE} < {FALSE}" ); - // Constrast normal vs SQL WHERE semantics - comparison + // Contrast normal vs SQL WHERE semantics - comparison expect_eq!( AllNullTestFilter.eval_expr(&Expr::lt(col.clone(), VAL), false), None, @@ -321,7 +321,7 @@ fn test_sql_where() { "WHERE {VAL} < {col}" ); - // Constrast normal vs SQL WHERE semantics - comparison inside AND + // Contrast normal vs SQL WHERE semantics - comparison inside AND expect_eq!( AllNullTestFilter.eval_expr(&Expr::and(NULL, Expr::lt(col.clone(), VAL)), false), None, diff --git a/kernel/src/predicates/tests.rs b/kernel/src/predicates/tests.rs index ce273e7b8..fa4aec191 100644 --- a/kernel/src/predicates/tests.rs +++ b/kernel/src/predicates/tests.rs @@ -51,7 +51,7 @@ fn test_default_eval_scalar() { } } -// verifies that partial orderings behave as excpected for all Scalar types +// verifies that partial orderings behave as expected for all Scalar types #[test] fn test_default_partial_cmp_scalars() { use Ordering::*; diff --git a/kernel/src/scan/data_skipping.rs b/kernel/src/scan/data_skipping.rs index 54eb5344c..847855d4a 100644 --- a/kernel/src/scan/data_skipping.rs +++ b/kernel/src/scan/data_skipping.rs @@ -24,7 +24,7 @@ mod tests; /// Returns `None` if the predicate is not eligible for data skipping. /// /// We normalize each binary operation to a comparison between a column and a literal value and -/// rewite that in terms of the min/max values of the column. +/// rewrite that in terms of the min/max values of the column. /// For example, `1 < a` is rewritten as `minValues.a > 1`. /// /// For Unary `Not`, we push the Not down using De Morgan's Laws to invert everything below the Not. diff --git a/kernel/src/schema.rs b/kernel/src/schema.rs index 42901751f..9cb6769f9 100644 --- a/kernel/src/schema.rs +++ b/kernel/src/schema.rs @@ -1156,7 +1156,7 @@ mod tests { ), ]); - // Similer to SchemaDepthChecker::check, but also returns call count + // Similar to SchemaDepthChecker::check, but also returns call count let check_with_call_count = |depth_limit| SchemaDepthChecker::check_with_call_count(&schema, depth_limit); diff --git a/kernel/src/transaction.rs b/kernel/src/transaction.rs index c6e93ea7b..c73782f64 100644 --- a/kernel/src/transaction.rs +++ b/kernel/src/transaction.rs @@ -241,7 +241,7 @@ impl WriteContext { /// Result after committing a transaction. If 'committed', the version is the new version written /// to the log. If 'conflict', the transaction is returned so the caller can resolve the conflict /// (along with the version which conflicted). -// TODO(zach): in order to make the returning of a transcation useful, we need to add APIs to +// TODO(zach): in order to make the returning of a transaction useful, we need to add APIs to // update the transaction to a new version etc. 
#[derive(Debug)] pub enum CommitResult { diff --git a/kernel/tests/golden_tables.rs b/kernel/tests/golden_tables.rs index 1d0c8406b..120271ef2 100644 --- a/kernel/tests/golden_tables.rs +++ b/kernel/tests/golden_tables.rs @@ -26,7 +26,7 @@ use delta_kernel::engine::default::DefaultEngine; mod common; use common::{load_test_data, to_arrow}; -// NB adapated from DAT: read all parquet files in the directory and concatenate them +// NB adapted from DAT: read all parquet files in the directory and concatenate them async fn read_expected(path: &Path) -> DeltaResult { let store = Arc::new(LocalFileSystem::new_with_prefix(path)?); let files = store.list(None).try_collect::>().await?; @@ -89,7 +89,7 @@ fn sort_record_batch(batch: RecordBatch) -> DeltaResult { Ok(RecordBatch::try_new(batch.schema(), columns)?) } -// Ensure that two sets of fields have the same names, and dict_id/ordering. +// Ensure that two sets of fields have the same names, and dict_is_ordered // We ignore: // - data type: This is checked already in `assert_columns_match` // - nullability: parquet marks many things as nullable that we don't in our schema @@ -103,10 +103,6 @@ fn assert_fields_match<'a>( actual_field.name() == expected_field.name(), "Field names don't match" ); - assert!( - actual_field.dict_id() == expected_field.dict_id(), - "Field dict_id doesn't match" - ); assert!( actual_field.dict_is_ordered() == expected_field.dict_is_ordered(), "Field dict_is_ordered doesn't match" @@ -372,7 +368,7 @@ golden_test!("deltalog-getChanges", latest_snapshot_test); golden_test!("dv-partitioned-with-checkpoint", latest_snapshot_test); golden_test!("dv-with-columnmapping", latest_snapshot_test); -skip_test!("hive": "test not yet implmented - different file structure"); +skip_test!("hive": "test not yet implemented - different file structure"); golden_test!("kernel-timestamp-int96", latest_snapshot_test); golden_test!("kernel-timestamp-pst", latest_snapshot_test); golden_test!("kernel-timestamp-timestamp_micros", latest_snapshot_test); @@ -440,11 +436,11 @@ skip_test!("canonicalized-paths-special-b": "BUG: path canonicalization"); // // We added two add files with the same path `foo`. The first should have been removed. // // The second should remain, and should have a hard-coded modification time of 1700000000000L // assert(foundFiles.find(_.getPath.endsWith("foo")).exists(_.getModificationTime == 1700000000000L)) -skip_test!("delete-re-add-same-file-different-transactions": "test not yet implmented"); +skip_test!("delete-re-add-same-file-different-transactions": "test not yet implemented"); // data file doesn't exist, get the relative path to compare // assert(new File(addFileStatus.getPath).getName == "special p@#h") -skip_test!("log-replay-special-characters-b": "test not yet implmented"); +skip_test!("log-replay-special-characters-b": "test not yet implemented"); negative_test!("deltalog-invalid-protocol-version"); negative_test!("deltalog-state-reconstruction-from-checkpoint-missing-metadata"); diff --git a/release.sh b/release.sh new file mode 100755 index 000000000..b16605c9d --- /dev/null +++ b/release.sh @@ -0,0 +1,186 @@ +#!/usr/bin/env bash + +################################################################################################### +# USAGE: +# 1. on a release branch: ./release.sh (example: ./release.sh 0.1.0) +# 2. 
on main branch (after merging release branch): ./release.sh +################################################################################################### + +# This is a script to automate a large portion of the release process for the crates we publish to +# crates.io. Currently only `delta_kernel` (in the kernel/ dir) and `delta_kernel_derive` (in the +# derive-macros/ dir) are released. + +# Exit on error, undefined variables, and pipe failures +set -euo pipefail + +# print commands before executing them for debugging +# set -x + +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # no color + +log_info() { echo -e "${BLUE}[INFO]${NC} $1"; } +log_success() { echo -e "${GREEN}[SUCCESS]${NC} $1"; } +log_warning() { echo -e "${YELLOW}[WARNING]${NC} $1"; } +log_error() { echo -e "${RED}[ERROR]${NC} $1"; exit 1; } + +check_requirements() { + log_info "Checking required tools..." + + command -v cargo >/dev/null 2>&1 || log_error "cargo is required but not installed" + command -v git >/dev/null 2>&1 || log_error "git is required but not installed" + command -v cargo-release >/dev/null 2>&1 || log_error "cargo-release is required but not installed. Install with: cargo install cargo-release" + command -v git-cliff >/dev/null 2>&1 || log_error "git-cliff is required but not installed. Install with: cargo install git-cliff" + command -v jq >/dev/null 2>&1 || log_error "jq is required but not installed." + + log_success "All required tools are available" +} + +is_main_branch() { + local current_branch + current_branch=$(git rev-parse --abbrev-ref HEAD) + [[ "$current_branch" == "main" ]] +} + +is_working_tree_clean() { + git diff --quiet && git diff --cached --quiet +} + +# check if the version is already published on crates.io +is_version_published() { + local crate_name="$1" + local version + version=$(get_current_version "$crate_name") + + if [[ -z "$version" ]]; then + log_error "Could not find crate '$crate_name' in workspace" + fi + + if cargo search "$crate_name" | grep -q "^$crate_name = \"$version\""; then + return 0 + else + return 1 + fi +} + +# get current version from Cargo.toml +get_current_version() { + local crate_name="$1" + cargo metadata --no-deps --format-version 1 | \ + jq -r --arg name "$crate_name" '.packages[] | select(.name == $name) | .version' +} + +# Prompt user for confirmation +confirm() { + local prompt="$1" + local response + + echo -e -n "${YELLOW}${prompt} [y/N]${NC} " + read -r response + + [[ "$response" =~ ^[Yy] ]] +} + +# handle release branch workflow (CHANGELOG updates, README updates, PR to main) +handle_release_branch() { + local version="$1" + + log_info "Starting release preparation for version $version..." + + # Update CHANGELOG and README + log_info "Updating CHANGELOG.md and README.md..." + if ! cargo release --workspace "$version" --no-publish --no-push --no-tag --execute; then + log_error "Failed to update CHANGELOG and README" + fi + + if confirm "Print diff of CHANGELOG/README changes?"; then + git diff --stat HEAD^ + git diff HEAD^ + fi + + if confirm "Would you like to push these changes to 'origin' remote?"; then + local current_branch + current_branch=$(git rev-parse --abbrev-ref HEAD) + + log_info "Pushing changes to remote..." 
+ git push origin "$current_branch" + + if confirm "Would you like to create a PR to merge this release into 'main'?"; then + if command -v gh >/dev/null 2>&1; then + gh pr create --title "release $version" --body "release $version" + log_success "PR created successfully" + else + log_warning "GitHub CLI not found. Please create a PR manually." + fi + fi + fi +} + +# Handle main branch workflow (publish and tag) +handle_main_branch() { + # could potentially just use full 'cargo release' command here + publish "delta_kernel_derive" + publish "delta_kernel" + + # hack: just redo getting the version + local version + version=$(get_current_version "delta_kernel") + + if confirm "Would you like to tag this release?"; then + log_info "Tagging release $version..." + if confirm "Tagging as v$version. continue?"; then + git tag -a "v$version" -m "Release v$version" + git push upstream tag "v$version" + log_success "Tagged release $version" + fi + fi +} + +publish() { + local crate_name="$1" + local current_version + current_version=$(get_current_version "$crate_name") + + if is_version_published "$crate_name"; then + log_error "$crate_name version $current_version is already published to crates.io" + fi + log_info "[DRY RUN] Publishing $crate_name version $current_version to crates.io..." + if ! cargo publish --dry-run -p "$crate_name"; then + log_error "Failed to publish $crate_name to crates.io" + fi + + if confirm "Dry run complete. Continue with publishing?"; then + log_info "Publishing $crate_name version $current_version to crates.io..." + if ! cargo publish -p "$crate_name"; then + log_error "Failed to publish $crate_name to crates.io" + fi + log_success "Successfully published $crate_name version $current_version to crates.io" + fi +} + + +validate_version() { + local version=$1 + # Check if version starts with a number + if [[ ! $version =~ ^[0-9] ]]; then + log_error "Version must start with a number (e.g., '0.1.1'). Got: '$version'" + fi +} + +check_requirements + +if is_main_branch; then + if [[ $# -ne 0 ]]; then + log_error "Version argument not expected on main branch\nUsage: $0" + fi + handle_main_branch +else + if [[ $# -ne 1 ]]; then + log_error "Version argument required when on release branch\nUsage: $0 " + fi + validate_version "$1" + handle_release_branch "$1" +fi diff --git a/release.toml b/release.toml new file mode 100644 index 000000000..b04bd6b47 --- /dev/null +++ b/release.toml @@ -0,0 +1,3 @@ +tag = false +publish = false +pre-release-commit-message = "release {{version}}" diff --git a/test-utils/Cargo.toml b/test-utils/Cargo.toml index 1bc5e41f8..0a90e96ed 100644 --- a/test-utils/Cargo.toml +++ b/test-utils/Cargo.toml @@ -8,6 +8,9 @@ repository.workspace = true readme.workspace = true version.workspace = true +[package.metadata.release] +release = false + [dependencies] arrow-array = { workspace = true, features = ["chrono-tz"] } arrow-schema = { workspace = true }