From 5e5c299a0cd748536d4d70fa9f88fb5b4b877bb6 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Mon, 28 Oct 2024 12:39:16 -0700 Subject: [PATCH] Add visitor for converting kernel expressions to engine expressions (#363) ## Rationale for the change Engines may need to apply expressions that originate in the kernel. This can include filter predicates such as filters for protocol and metadata actions in a log file. Another example is logical to physical transformations that the engine must apply, for instance to materialize partition columns. This PR introduces a visitor framework for kernel expressions to be converted to engine expressions so that engines can apply their own evaluators, predicate pushdowns, and optimizations on these expressions. Closes: #358 ## What changes are included in this PR? 5 major changes are made in this PR: 1) A new module `ffi/src/expressions/kernel.rs` is added. This module is responsible for converting kernel `Expression`s to the engine's representation of expressions through an ffi-compatible visitor. The visitor supports binary operations, scalars, struct expressions, variadic and unary operators, and columns. All the engine has to do is implement an `EngineExpressionVisitor` to support the conversion. 2) The `EnginePredicate` to kernel `Expression`visitor code is moved to `ffi/src/expressions/engine.rs`. This is done to keep all ffi expression code under the `ffi/src/expressions` module. 3) This PR also allows engines to get `SharedExpression`s, which are handles to the kernel's `Expression` type. These are given back to the kernel to specify which expression the engine would like to visit. 4) A new ffi example is added to showcase how the `EngineExpressionVisitor` can be used to construct expressions. This is a C implementation that can be found in `ffi/examples/visit-expression/`. The example only prints an expression that it receives from the kernel and asserts that it matches an expected output. 5) A new testing feature flag `test-ffi` is added to the `ffi` crate. This feature flag is used to expose testing functions found in `ffi/src/test_ffi`. This module is currently used to return a kernel expression `get_testing_kernel_expression` used in the example c implementation. ## Are these changes tested? Two tests are added: - I introduced a new test to validate the expression visitor functions correctly and converts data as expected. This simply prints out the expression tree structure and compares it to an expected result. The kernel expression can be found in `ffi/src/test_ffi.rs`, and the expected output is in `ffi/tests/test-expression-visitor/expected.txt` - There is also a test that checks for memory leaks in the expression visitor using valgrind. ## Are there any user-facing changes? This PR introduces a public facing API for constructing expressions and visiting them. It also adds `delta_kernel_derive` as a public export of `kernel` crate. This PR does not break existing APIs. --- .github/workflows/build.yml | 13 +- ffi/Cargo.toml | 1 + ffi/examples/visit-expression/CMakeLists.txt | 33 ++ ffi/examples/visit-expression/expression.h | 440 ++++++++++++++++++ .../visit-expression/expression_print.h | 206 ++++++++ .../visit-expression/visit_expression.c | 12 + .../{expressions.rs => expressions/engine.rs} | 2 + ffi/src/expressions/kernel.rs | 367 +++++++++++++++ ffi/src/expressions/mod.rs | 10 + ffi/src/lib.rs | 2 + ffi/src/scan.rs | 4 +- ffi/src/test_ffi.rs | 95 ++++ .../test-expression-visitor/expected.txt | 78 ++++ ffi/tests/test-expression-visitor/run_test.sh | 12 + .../engine/parquet_stats_skipping/tests.rs | 4 +- kernel/src/expressions/column_names.rs | 2 +- kernel/src/expressions/scalars.rs | 5 +- kernel/src/lib.rs | 1 + 18 files changed, 1279 insertions(+), 8 deletions(-) create mode 100644 ffi/examples/visit-expression/CMakeLists.txt create mode 100644 ffi/examples/visit-expression/expression.h create mode 100644 ffi/examples/visit-expression/expression_print.h create mode 100644 ffi/examples/visit-expression/visit_expression.c rename ffi/src/{expressions.rs => expressions/engine.rs} (97%) create mode 100644 ffi/src/expressions/kernel.rs create mode 100644 ffi/src/expressions/mod.rs create mode 100644 ffi/src/test_ffi.rs create mode 100644 ffi/tests/test-expression-visitor/expected.txt create mode 100755 ffi/tests/test-expression-visitor/run_test.sh diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index c977fde7a..9795be5dd 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -86,6 +86,7 @@ jobs: sudo apt update sudo apt install -y -V libarrow-dev # For C++ sudo apt install -y -V libarrow-glib-dev # For GLib (C) + sudo apt install -y -V valgrind # For memory leak test elif [ "$RUNNER_OS" == "macOS" ]; then brew install apache-arrow brew install apache-arrow-glib @@ -108,9 +109,9 @@ jobs: cargo build popd pushd ffi - cargo b --features default-engine,sync-engine + cargo b --features default-engine,sync-engine,test-ffi popd - - name: build and run test + - name: build and run read-table test run: | pushd ffi/examples/read-table mkdir build @@ -118,6 +119,14 @@ jobs: cmake .. make make test + - name: build and run visit-expression test + run: | + pushd ffi/examples/visit-expression + mkdir build + pushd build + cmake .. + make + make test coverage: runs-on: ubuntu-latest diff --git a/ffi/Cargo.toml b/ffi/Cargo.toml index 9722eb8f3..1bd69e708 100644 --- a/ffi/Cargo.toml +++ b/ffi/Cargo.toml @@ -47,3 +47,4 @@ default-engine = [ ] sync-engine = ["delta_kernel/sync-engine"] developer-visibility = [] +test-ffi = [] diff --git a/ffi/examples/visit-expression/CMakeLists.txt b/ffi/examples/visit-expression/CMakeLists.txt new file mode 100644 index 000000000..1b89deec6 --- /dev/null +++ b/ffi/examples/visit-expression/CMakeLists.txt @@ -0,0 +1,33 @@ +cmake_minimum_required(VERSION 3.12) +project(visit_expressions) + +add_executable(visit_expression visit_expression.c) +target_compile_definitions(visit_expression PUBLIC DEFINE_DEFAULT_ENGINE) +target_include_directories(visit_expression PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/../../../target/ffi-headers") +target_link_directories(visit_expression PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/../../../target/debug") +target_link_libraries(visit_expression PUBLIC delta_kernel_ffi) +target_compile_options(visit_expression PUBLIC) + +target_compile_options(visit_expression PRIVATE -Wall -Wextra -Wpedantic -Werror -Wno-strict-prototypes) + +# Get info on the OS and platform +if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") + set(MACOSX TRUE) +endif() +if(UNIX AND NOT APPLE) + set(LINUX TRUE) +endif() + +# Add the kernel expresion -> engine expression test +include(CTest) +set(ExprTestRunner "../../../tests/test-expression-visitor/run_test.sh") +set(ExprExpectedPath "../../../tests/test-expression-visitor/expected.txt") +add_test(NAME test_expression_visitor COMMAND ${ExprTestRunner} ${ExprExpectedPath}) +if(LINUX) + add_test(NAME test_expression_visitor_leaks COMMAND valgrind + --error-exitcode=1 + --tool=memcheck + --leak-check=full + --errors-for-leak-kinds=definite + --show-leak-kinds=definite ./visit_expression) +endif() diff --git a/ffi/examples/visit-expression/expression.h b/ffi/examples/visit-expression/expression.h new file mode 100644 index 000000000..302566d10 --- /dev/null +++ b/ffi/examples/visit-expression/expression.h @@ -0,0 +1,440 @@ +#pragma once + +#include "delta_kernel_ffi.h" +#include +#include +#include +#include +#include +#include + +/** + * This module defines a very simple model of an expression, used only to be able to print the + * provided expression. It consists of an "ExpressionBuilder" which is our user data that gets + * passed into each visit_x call. This simply keeps track of all the lists we are asked to allocate. + * + * Each "ExpressionItem" tracks the type and pointer to the expression. + * + * Each complex type is made of an "ExpressionItemList", which tracks its length and an array of + * "ExpressionItems" that make up the complex type. The top level expression is in a length 1 + * "ExpressionItemList". + */ + +/************************************************************* + * Data Types + ************************************************************/ +enum OpType { + Add, + Minus, + Divide, + Multiply, + LessThan, + LessThanOrEqual, + GreaterThan, + GreaterThaneOrEqual, + Equal, + NotEqual, + Distinct, + In, + NotIn, +}; +enum LitType { + Integer, + Long, + Short, + Byte, + Float, + Double, + String, + Boolean, + Timestamp, + TimestampNtz, + Date, + Binary, + Decimal, + Null, + Struct, + Array +}; +enum ExpressionType { BinOp, Variadic, Literal, Unary, Column }; +enum VariadicType { + And, + Or, + StructExpression, +}; +enum UnaryType { Not, IsNull }; +typedef struct { + void* ref; + enum ExpressionType type; +} ExpressionItem; +typedef struct { + uint32_t len; + ExpressionItem* list; +} ExpressionItemList; +struct BinOp { + enum OpType op; + ExpressionItemList exprs; +}; +struct Variadic { + enum VariadicType op; + ExpressionItemList exprs; +}; +struct Unary { + enum UnaryType type; + ExpressionItemList sub_expr; +}; +struct BinaryData { + uint8_t* buf; + uintptr_t len; +}; +struct Decimal { + uint64_t value[2]; + uint8_t precision; + uint8_t scale; +}; +typedef struct { + size_t list_count; + ExpressionItemList* lists; +} ExpressionBuilder; +struct Struct { + ExpressionItemList fields; + ExpressionItemList values; +}; +struct ArrayData { + ExpressionItemList exprs; +}; +struct Literal { + enum LitType type; + union LiteralValue { + int32_t integer_data; + int64_t long_data; + int16_t short_data; + int8_t byte_data; + float float_data; + double double_data; + bool boolean_data; + char* string_data; + struct Struct struct_data; + struct ArrayData array_data; + struct BinaryData binary; + struct Decimal decimal; + } value; +}; + +/************************************************************* + * Utility functions + ************************************************************/ +void put_expr_item(void* data, size_t sibling_list_id, void* ref, enum ExpressionType type) { + ExpressionBuilder* data_ptr = (ExpressionBuilder*)data; + ExpressionItem expr = { .ref = ref, .type = type }; + ExpressionItemList* list = &data_ptr->lists[sibling_list_id]; + list->list[list->len++] = expr; +} +ExpressionItemList get_expr_list(void* data, size_t list_id) { + ExpressionBuilder* data_ptr = (ExpressionBuilder*)data; + assert(list_id < data_ptr->list_count); + return data_ptr->lists[list_id]; +} +// utility to turn a slice into a char* +char* allocate_string(const KernelStringSlice slice) { + return strndup(slice.ptr, slice.len); +} + +/************************************************************* + * Binary Operations + ************************************************************/ +#define DEFINE_BINOP(fun_name, op) \ + void fun_name(void* data, uintptr_t sibling_list_id, uintptr_t child_list_id) { \ + visit_expr_binop(data, sibling_list_id, op, child_list_id); \ + } +void visit_expr_binop(void* data, + uintptr_t sibling_id_list, + enum OpType op, + uintptr_t child_id_list) { + struct BinOp* binop = malloc(sizeof(struct BinOp)); + binop->op = op; + binop->exprs = get_expr_list(data, child_id_list); + put_expr_item(data, sibling_id_list, binop, BinOp); +} +DEFINE_BINOP(visit_expr_add, Add) +DEFINE_BINOP(visit_expr_minus, Minus) +DEFINE_BINOP(visit_expr_multiply, Multiply) +DEFINE_BINOP(visit_expr_divide, Divide) +DEFINE_BINOP(visit_expr_lt, LessThan) +DEFINE_BINOP(visit_expr_le, LessThanOrEqual) +DEFINE_BINOP(visit_expr_gt, GreaterThan) +DEFINE_BINOP(visit_expr_ge, GreaterThaneOrEqual) +DEFINE_BINOP(visit_expr_eq, Equal) +DEFINE_BINOP(visit_expr_ne, NotEqual) +DEFINE_BINOP(visit_expr_distinct, Distinct) +DEFINE_BINOP(visit_expr_in, In) +DEFINE_BINOP(visit_expr_not_in, NotIn) +#undef DEFINE_BINOP + +/************************************************************* + * Literal Values + ************************************************************/ +#define DEFINE_SIMPLE_SCALAR(fun_name, enum_member, c_type, literal_field) \ + void fun_name(void* data, uintptr_t sibling_list_id, c_type val) { \ + struct Literal* lit = malloc(sizeof(struct Literal)); \ + lit->type = enum_member; \ + lit->value.literal_field = val; \ + put_expr_item(data, sibling_list_id, lit, Literal); \ + } \ + _Static_assert(sizeof(c_type) <= sizeof(uintptr_t), \ + "The provided type is not a valid simple scalar") +DEFINE_SIMPLE_SCALAR(visit_expr_int_literal, Integer, int32_t, integer_data); +DEFINE_SIMPLE_SCALAR(visit_expr_long_literal, Long, int64_t, long_data); +DEFINE_SIMPLE_SCALAR(visit_expr_short_literal, Short, int16_t, short_data); +DEFINE_SIMPLE_SCALAR(visit_expr_byte_literal, Byte, int8_t, byte_data); +DEFINE_SIMPLE_SCALAR(visit_expr_float_literal, Float, float, float_data); +DEFINE_SIMPLE_SCALAR(visit_expr_double_literal, Double, double, double_data); +DEFINE_SIMPLE_SCALAR(visit_expr_boolean_literal, Boolean, _Bool, boolean_data); +DEFINE_SIMPLE_SCALAR(visit_expr_timestamp_literal, Timestamp, int64_t, long_data); +DEFINE_SIMPLE_SCALAR(visit_expr_timestamp_ntz_literal, TimestampNtz, int64_t, long_data); +DEFINE_SIMPLE_SCALAR(visit_expr_date_literal, Date, int32_t, integer_data); +#undef DEFINE_SIMPLE_SCALAR + +void visit_expr_string_literal(void* data, uintptr_t sibling_list_id, KernelStringSlice string) { + struct Literal* literal = malloc(sizeof(struct Literal)); + literal->type = String; + literal->value.string_data = allocate_string(string); + put_expr_item(data, sibling_list_id, literal, Literal); +} +void visit_expr_decimal_literal(void* data, + uintptr_t sibling_list_id, + uint64_t value_ms, + uint64_t value_ls, + uint8_t precision, + uint8_t scale) { + struct Literal* literal = malloc(sizeof(struct Literal)); + literal->type = Decimal; + struct Decimal* dec = &literal->value.decimal; + dec->value[0] = value_ms; + dec->value[1] = value_ls; + dec->precision = precision; + dec->scale = scale; + put_expr_item(data, sibling_list_id, literal, Literal); +} +void visit_expr_binary_literal(void* data, + uintptr_t sibling_list_id, + const uint8_t* buf, + uintptr_t len) { + struct Literal* literal = malloc(sizeof(struct Literal)); + literal->type = Binary; + struct BinaryData* bin = &literal->value.binary; + bin->buf = malloc(len); + bin->len = len; + memcpy(bin->buf, buf, len); + put_expr_item(data, sibling_list_id, literal, Literal); +} +void visit_expr_struct_literal(void* data, + uintptr_t sibling_list_id, + uintptr_t child_field_list_id, + uintptr_t child_value_list_id) { + struct Literal* literal = malloc(sizeof(struct Literal)); + literal->type = Struct; + struct Struct* struct_data = &literal->value.struct_data; + struct_data->fields = get_expr_list(data, child_field_list_id); + struct_data->values = get_expr_list(data, child_value_list_id); + put_expr_item(data, sibling_list_id, literal, Literal); +} +void visit_expr_null_literal(void* data, uintptr_t sibling_id_list) { + struct Literal* literal = malloc(sizeof(struct Literal)); + literal->type = Null; + put_expr_item(data, sibling_id_list, literal, Literal); +} + +/************************************************************* + * Variadic Expressions + ************************************************************/ +#define DEFINE_VARIADIC(fun_name, enum_member) \ + void fun_name(void* data, uintptr_t sibling_list_id, uintptr_t child_list_id) { \ + visit_expr_variadic(data, sibling_list_id, enum_member, child_list_id); \ + } + +void visit_expr_variadic(void* data, + uintptr_t sibling_list_id, + enum VariadicType op, + uintptr_t child_list_id) { + struct Variadic* var = malloc(sizeof(struct Variadic)); + var->op = op; + var->exprs = get_expr_list(data, child_list_id); + put_expr_item(data, sibling_list_id, var, Variadic); +} +DEFINE_VARIADIC(visit_expr_and, And) +DEFINE_VARIADIC(visit_expr_or, Or) +DEFINE_VARIADIC(visit_expr_struct_expr, StructExpression) +#undef DEFINE_VARIADIC + +void visit_expr_array_literal(void* data, uintptr_t sibling_list_id, uintptr_t child_list_id) { + struct Literal* literal = malloc(sizeof(struct Literal)); + literal->type = Array; + struct ArrayData* arr = &(literal->value.array_data); + arr->exprs = get_expr_list(data, child_list_id); + put_expr_item(data, sibling_list_id, literal, Literal); +} + +/************************************************************* + * Unary Expressions + ************************************************************/ +#define DEFINE_UNARY(fun_name, op) \ + void fun_name(void* data, uintptr_t sibling_list_id, uintptr_t child_list_id) { \ + visit_expr_unary(data, sibling_list_id, op, child_list_id); \ + } + +void visit_expr_unary(void* data, + uintptr_t sibling_list_id, + enum UnaryType type, + uintptr_t child_list_id) { + struct Unary* unary = malloc(sizeof(struct Unary)); + unary->type = type; + unary->sub_expr = get_expr_list(data, child_list_id); + put_expr_item(data, sibling_list_id, unary, Unary); +} +DEFINE_UNARY(visit_expr_is_null, IsNull) +DEFINE_UNARY(visit_expr_not, Not) +#undef DEFINE_UNARY + +/************************************************************* + * Column Expression + ************************************************************/ +void visit_expr_column(void* data, uintptr_t sibling_id_list, KernelStringSlice col_name) { + char* column_name = allocate_string(col_name); + put_expr_item(data, sibling_id_list, column_name, Column); +} + +/************************************************************* + * EngineExpressionVisitor Implementation + ************************************************************/ +uintptr_t make_field_list(void* data, uintptr_t reserve) { + ExpressionBuilder* builder = data; + int id = builder->list_count; + builder->list_count++; + builder->lists = realloc(builder->lists, sizeof(ExpressionItemList) * builder->list_count); + ExpressionItem* list = calloc(reserve, sizeof(ExpressionItem)); + builder->lists[id].len = 0; + builder->lists[id].list = list; + return id; +} + +ExpressionItemList construct_predicate(SharedExpression* predicate) { + ExpressionBuilder data = { 0 }; + EngineExpressionVisitor visitor = { + .data = &data, + .make_field_list = make_field_list, + .visit_literal_int = visit_expr_int_literal, + .visit_literal_long = visit_expr_long_literal, + .visit_literal_short = visit_expr_short_literal, + .visit_literal_byte = visit_expr_byte_literal, + .visit_literal_float = visit_expr_float_literal, + .visit_literal_double = visit_expr_double_literal, + .visit_literal_bool = visit_expr_boolean_literal, + .visit_literal_timestamp = visit_expr_timestamp_literal, + .visit_literal_timestamp_ntz = visit_expr_timestamp_ntz_literal, + .visit_literal_date = visit_expr_date_literal, + .visit_literal_binary = visit_expr_binary_literal, + .visit_literal_null = visit_expr_null_literal, + .visit_literal_decimal = visit_expr_decimal_literal, + .visit_literal_string = visit_expr_string_literal, + .visit_literal_struct = visit_expr_struct_literal, + .visit_literal_array = visit_expr_array_literal, + .visit_and = visit_expr_and, + .visit_or = visit_expr_or, + .visit_not = visit_expr_not, + .visit_is_null = visit_expr_is_null, + .visit_lt = visit_expr_lt, + .visit_le = visit_expr_le, + .visit_gt = visit_expr_gt, + .visit_ge = visit_expr_ge, + .visit_eq = visit_expr_eq, + .visit_ne = visit_expr_ne, + .visit_distinct = visit_expr_distinct, + .visit_in = visit_expr_in, + .visit_not_in = visit_expr_not_in, + .visit_add = visit_expr_add, + .visit_minus = visit_expr_minus, + .visit_multiply = visit_expr_multiply, + .visit_divide = visit_expr_divide, + .visit_column = visit_expr_column, + .visit_struct_expr = visit_expr_struct_expr, + }; + uintptr_t top_level_id = visit_expression(&predicate, &visitor); + ExpressionItemList top_level_expr = data.lists[top_level_id]; + free(data.lists); + return top_level_expr; +} + +void free_expression_list(ExpressionItemList list); +void free_expression_item(ExpressionItem ref) { + switch (ref.type) { + case BinOp: { + struct BinOp* op = ref.ref; + free_expression_list(op->exprs); + free(op); + break; + } + case Variadic: { + struct Variadic* var = ref.ref; + free_expression_list(var->exprs); + free(var); + break; + }; + case Literal: { + struct Literal* lit = ref.ref; + switch (lit->type) { + case Struct: { + struct Struct* struct_data = &lit->value.struct_data; + free_expression_list(struct_data->values); + free_expression_list(struct_data->fields); + break; + } + case Array: { + struct ArrayData* array = &lit->value.array_data; + free_expression_list(array->exprs); + break; + } + case String: { + free(lit->value.string_data); + break; + } + case Binary: { + struct BinaryData* binary = &lit->value.binary; + free(binary->buf); + break; + } + case Integer: + case Long: + case Short: + case Byte: + case Float: + case Double: + case Boolean: + case Timestamp: + case TimestampNtz: + case Date: + case Decimal: + case Null: + break; + } + free(lit); + break; + }; + case Unary: { + struct Unary* unary = ref.ref; + free_expression_list(unary->sub_expr); + free(unary); + break; + } + case Column: { + free(ref.ref); + break; + } + } +} +void free_expression_list(ExpressionItemList list) { + for (size_t i = 0; i < list.len; i++) { + free_expression_item(list.list[i]); + } + free(list.list); +} diff --git a/ffi/examples/visit-expression/expression_print.h b/ffi/examples/visit-expression/expression_print.h new file mode 100644 index 000000000..23c1dcb7b --- /dev/null +++ b/ffi/examples/visit-expression/expression_print.h @@ -0,0 +1,206 @@ +#pragma once + +#include "expression.h" + +/** + * This module defines a function `print_tree` to recursively print an ExpressionItem. + */ + +void print_tree_helper(ExpressionItem ref, int depth); +void print_n_spaces(int n) { + if (n == 0) + return; + printf(" "); + print_n_spaces(n - 1); +} +void print_expression_item_list(ExpressionItemList list, int depth) { + for (size_t i = 0; i < list.len; i++) { + print_tree_helper(list.list[i], depth); + } +} +void print_tree_helper(ExpressionItem ref, int depth) { + switch (ref.type) { + case BinOp: { + struct BinOp* op = ref.ref; + print_n_spaces(depth); + switch (op->op) { + case Add: { + printf("Add\n"); + break; + } + case Minus: { + printf("Minus\n"); + break; + }; + case Divide: { + printf("Divide\n"); + break; + }; + case Multiply: { + printf("Multiply\n"); + break; + }; + case LessThan: { + printf("LessThan\n"); + break; + }; + case LessThanOrEqual: { + printf("LessThanOrEqual\n"); + break; + } + case GreaterThan: { + printf("GreaterThan\n"); + break; + }; + case GreaterThaneOrEqual: { + printf("GreaterThanOrEqual\n"); + break; + }; + case Equal: { + printf("Equal\n"); + break; + }; + case NotEqual: { + printf("NotEqual\n"); + break; + }; + case In: { + printf("In\n"); + break; + }; + case NotIn: { + printf("NotIn\n"); + break; + }; break; + case Distinct: + printf("Distinct\n"); + break; + } + print_expression_item_list(op->exprs, depth + 1); + break; + } + case Variadic: { + struct Variadic* var = ref.ref; + print_n_spaces(depth); + switch (var->op) { + case And: + printf("And\n"); + break; + case Or: + printf("Or\n"); + break; + case StructExpression: + printf("StructExpression\n"); + break; + } + print_expression_item_list(var->exprs, depth + 1); + break; + } + case Literal: { + struct Literal* lit = ref.ref; + print_n_spaces(depth); + switch (lit->type) { + case Integer: + printf("Integer(%d)\n", lit->value.integer_data); + break; + case Long: + printf("Long(%lld)\n", (long long)lit->value.long_data); + break; + case Short: + printf("Short(%hd)\n", lit->value.short_data); + break; + case Byte: + printf("Byte(%hhd)\n", lit->value.byte_data); + break; + case Float: + printf("Float(%f)\n", (float)lit->value.float_data); + break; + case Double: + printf("Double(%f)\n", lit->value.double_data); + break; + case String: { + printf("String(%s)\n", lit->value.string_data); + break; + } + case Boolean: + printf("Boolean(%d)\n", lit->value.boolean_data); + break; + case Timestamp: + printf("Timestamp(%lld)\n", (long long)lit->value.long_data); + break; + case TimestampNtz: + printf("TimestampNtz(%lld)\n", (long long)lit->value.long_data); + break; + case Date: + printf("Date(%d)\n", lit->value.integer_data); + break; + case Binary: { + printf("Binary("); + for (size_t i = 0; i < lit->value.binary.len; i++) { + printf("%02x", lit->value.binary.buf[i]); + } + printf(")\n"); + break; + } + case Decimal: { + struct Decimal* dec = &lit->value.decimal; + printf("Decimal(%lld,%lld, %d, %d)\n", + (long long)dec->value[0], + (long long)dec->value[1], + dec->scale, + dec->precision); + break; + } + case Null: + printf("Null\n"); + break; + case Struct: + printf("Struct\n"); + struct Struct* struct_data = &lit->value.struct_data; + for (size_t i = 0; i < struct_data->values.len; i++) { + print_n_spaces(depth + 1); + + // Extract field name from field + ExpressionItem item = struct_data->fields.list[i]; + assert(item.type == Literal); + struct Literal* lit = item.ref; + assert(lit->type == String); + + printf("Field: %s\n", lit->value.string_data); + print_tree_helper(struct_data->values.list[i], depth + 2); + } + break; + case Array: + printf("Array\n"); + struct ArrayData* array = &lit->value.array_data; + print_expression_item_list(array->exprs, depth + 1); + break; + } + break; + } + case Unary: { + print_n_spaces(depth); + struct Unary* unary = ref.ref; + switch (unary->type) { + case Not: + printf("Not\n"); + break; + case IsNull: + printf("IsNull\n"); + break; + } + + print_expression_item_list(unary->sub_expr, depth + 1); + break; + } + case Column: + print_n_spaces(depth); + char* column_name = ref.ref; + printf("Column(%s)\n", column_name); + break; + } +} + +void print_expression(ExpressionItemList expression) { + print_expression_item_list(expression, 0); +} diff --git a/ffi/examples/visit-expression/visit_expression.c b/ffi/examples/visit-expression/visit_expression.c new file mode 100644 index 000000000..2fec1b662 --- /dev/null +++ b/ffi/examples/visit-expression/visit_expression.c @@ -0,0 +1,12 @@ +#include "delta_kernel_ffi.h" +#include "expression.h" +#include "expression_print.h" + +int main() { + SharedExpression* pred = get_testing_kernel_expression(); + ExpressionItemList expr = construct_predicate(pred); + print_expression(expr); + free_expression_list(expr); + free_kernel_predicate(pred); + return 0; +} diff --git a/ffi/src/expressions.rs b/ffi/src/expressions/engine.rs similarity index 97% rename from ffi/src/expressions.rs rename to ffi/src/expressions/engine.rs index 19c334038..722576766 100644 --- a/ffi/src/expressions.rs +++ b/ffi/src/expressions/engine.rs @@ -1,3 +1,5 @@ +//! Defines [`KernelExpressionVisitorState`]. This is a visitor that can be used by an [`EnginePredicate`] +//! to convert engine expressions into kernel expressions. use std::ffi::c_void; use crate::{ diff --git a/ffi/src/expressions/kernel.rs b/ffi/src/expressions/kernel.rs new file mode 100644 index 000000000..47e3e6dea --- /dev/null +++ b/ffi/src/expressions/kernel.rs @@ -0,0 +1,367 @@ +//! Defines [`EngineExpressionVisitor`]. This is a visitor that can be used to convert the kernel's +//! [`Expression`] to an engine's expression format. +use crate::expressions::SharedExpression; +use std::ffi::c_void; + +use crate::{handle::Handle, KernelStringSlice}; +use delta_kernel::expressions::{ + ArrayData, BinaryOperator, Expression, Scalar, StructData, UnaryOperator, VariadicOperator, +}; + +/// Free the memory the passed SharedExpression +/// +/// # Safety +/// Engine is responsible for passing a valid SharedExpression +#[no_mangle] +pub unsafe extern "C" fn free_kernel_predicate(data: Handle) { + data.drop_handle(); +} + +type VisitLiteralFn = extern "C" fn(data: *mut c_void, sibling_list_id: usize, value: T); +type VisitBinaryOpFn = + extern "C" fn(data: *mut c_void, sibling_list_id: usize, child_list_id: usize); +type VisitVariadicFn = + extern "C" fn(data: *mut c_void, sibling_list_id: usize, child_list_id: usize); +type VisitUnaryFn = extern "C" fn(data: *mut c_void, sibling_list_id: usize, child_list_id: usize); + +/// The [`EngineExpressionVisitor`] defines a visitor system to allow engines to build their own +/// representation of a kernel expression. +/// +/// The model is list based. When the kernel needs a list, it will ask engine to allocate one of a +/// particular size. Once allocated the engine returns an `id`, which can be any integer identifier +/// ([`usize`]) the engine wants, and will be passed back to the engine to identify the list in the +/// future. +/// +/// Every expression the kernel visits belongs to some list of "sibling" elements. The schema +/// itself is a list of schema elements, and every complex type (struct expression, array, variadic, etc) +/// contains a list of "child" elements. +/// 1. Before visiting any complex expression type, the kernel asks the engine to allocate a list to +/// hold its children +/// 2. When visiting any expression element, the kernel passes its parent's "child list" as the +/// "sibling list" the element should be appended to: +/// - For a struct literal, first visit each struct field and visit each value +/// - For a struct expression, visit each sub expression. +/// - For an array literal, visit each of the elements. +/// - For a variadic `and` or `or` expression, visit each sub-expression. +/// - For a binary operator expression, visit the left and right operands. +/// - For a unary `is null` or `not` expression, visit the sub-expression. +/// 3. When visiting a complex expression, the kernel also passes the "child list" containing +/// that element's (already-visited) children. +/// 4. The [`visit_expression`] method returns the id of the list of top-level columns +/// +/// WARNING: The visitor MUST NOT retain internal references to string slices or binary data passed +/// to visitor methods +/// TODO: Visit type information in struct field and null. This will likely involve using the schema +/// visitor. Note that struct literals are currently in flux, and may change significantly. Here is the relevant +/// issue: https://github.com/delta-incubator/delta-kernel-rs/issues/412 +#[repr(C)] +pub struct EngineExpressionVisitor { + /// An opaque engine state pointer + pub data: *mut c_void, + /// Creates a new expression list, optionally reserving capacity up front + pub make_field_list: extern "C" fn(data: *mut c_void, reserve: usize) -> usize, + /// Visit a 32bit `integer` belonging to the list identified by `sibling_list_id`. + pub visit_literal_int: VisitLiteralFn, + /// Visit a 64bit `long` belonging to the list identified by `sibling_list_id`. + pub visit_literal_long: VisitLiteralFn, + /// Visit a 16bit `short` belonging to the list identified by `sibling_list_id`. + pub visit_literal_short: VisitLiteralFn, + /// Visit an 8bit `byte` belonging to the list identified by `sibling_list_id`. + pub visit_literal_byte: VisitLiteralFn, + /// Visit a 32bit `float` belonging to the list identified by `sibling_list_id`. + pub visit_literal_float: VisitLiteralFn, + /// Visit a 64bit `double` belonging to the list identified by `sibling_list_id`. + pub visit_literal_double: VisitLiteralFn, + /// Visit a `string` belonging to the list identified by `sibling_list_id`. + pub visit_literal_string: VisitLiteralFn, + /// Visit a `boolean` belonging to the list identified by `sibling_list_id`. + pub visit_literal_bool: VisitLiteralFn, + /// Visit a 64bit timestamp belonging to the list identified by `sibling_list_id`. + /// The timestamp is microsecond precision and adjusted to UTC. + pub visit_literal_timestamp: VisitLiteralFn, + /// Visit a 64bit timestamp belonging to the list identified by `sibling_list_id`. + /// The timestamp is microsecond precision with no timezone. + pub visit_literal_timestamp_ntz: VisitLiteralFn, + /// Visit a 32bit intger `date` representing days since UNIX epoch 1970-01-01. The `date` belongs + /// to the list identified by `sibling_list_id`. + pub visit_literal_date: VisitLiteralFn, + /// Visit binary data at the `buffer` with length `len` belonging to the list identified by + /// `sibling_list_id`. + pub visit_literal_binary: + extern "C" fn(data: *mut c_void, sibling_list_id: usize, buffer: *const u8, len: usize), + /// Visit a 128bit `decimal` value with the given precision and scale. The 128bit integer + /// is split into the most significant 64 bits in `value_ms`, and the least significant 64 + /// bits in `value_ls`. The `decimal` belongs to the list identified by `sibling_list_id`. + pub visit_literal_decimal: extern "C" fn( + data: *mut c_void, + sibling_list_id: usize, + value_ms: u64, + value_ls: u64, + precision: u8, + scale: u8, + ), + /// Visit a struct literal belonging to the list identified by `sibling_list_id`. + /// The field names of the struct are in a list identified by `child_field_list_id`. + /// The values of the struct are in a list identified by `child_value_list_id`. + pub visit_literal_struct: extern "C" fn( + data: *mut c_void, + sibling_list_id: usize, + child_field_list_value: usize, + child_value_list_id: usize, + ), + /// Visit an array literal belonging to the list identified by `sibling_list_id`. + /// The values of the array are in a list identified by `child_list_id`. + pub visit_literal_array: + extern "C" fn(data: *mut c_void, sibling_list_id: usize, child_list_id: usize), + /// Visits a null value belonging to the list identified by `sibling_list_id. + pub visit_literal_null: extern "C" fn(data: *mut c_void, sibling_list_id: usize), + /// Visits an `and` expression belonging to the list identified by `sibling_list_id`. + /// The sub-expressions of the array are in a list identified by `child_list_id` + pub visit_and: VisitVariadicFn, + /// Visits an `or` expression belonging to the list identified by `sibling_list_id`. + /// The sub-expressions of the array are in a list identified by `child_list_id` + pub visit_or: VisitVariadicFn, + /// Visits a `not` expression belonging to the list identified by `sibling_list_id`. + /// The sub-expression will be in a _one_ item list identified by `child_list_id` + pub visit_not: VisitUnaryFn, + /// Visits a `is_null` expression belonging to the list identified by `sibling_list_id`. + /// The sub-expression will be in a _one_ item list identified by `child_list_id` + pub visit_is_null: VisitUnaryFn, + /// Visits the `LessThan` binary operator belonging to the list identified by `sibling_list_id`. + /// The operands will be in a _two_ item list identified by `child_list_id` + pub visit_lt: VisitBinaryOpFn, + /// Visits the `LessThanOrEqual` binary operator belonging to the list identified by `sibling_list_id`. + /// The operands will be in a _two_ item list identified by `child_list_id` + pub visit_le: VisitBinaryOpFn, + /// Visits the `GreaterThan` binary operator belonging to the list identified by `sibling_list_id`. + /// The operands will be in a _two_ item list identified by `child_list_id` + pub visit_gt: VisitBinaryOpFn, + /// Visits the `GreaterThanOrEqual` binary operator belonging to the list identified by `sibling_list_id`. + /// The operands will be in a _two_ item list identified by `child_list_id` + pub visit_ge: VisitBinaryOpFn, + /// Visits the `Equal` binary operator belonging to the list identified by `sibling_list_id`. + /// The operands will be in a _two_ item list identified by `child_list_id` + pub visit_eq: VisitBinaryOpFn, + /// Visits the `NotEqual` binary operator belonging to the list identified by `sibling_list_id`. + /// The operands will be in a _two_ item list identified by `child_list_id` + pub visit_ne: VisitBinaryOpFn, + /// Visits the `Distinct` binary operator belonging to the list identified by `sibling_list_id`. + /// The operands will be in a _two_ item list identified by `child_list_id` + pub visit_distinct: VisitBinaryOpFn, + /// Visits the `In` binary operator belonging to the list identified by `sibling_list_id`. + /// The operands will be in a _two_ item list identified by `child_list_id` + pub visit_in: VisitBinaryOpFn, + /// Visits the `NotIn` binary operator belonging to the list identified by `sibling_list_id`. + /// The operands will be in a _two_ item list identified by `child_list_id` + pub visit_not_in: VisitBinaryOpFn, + /// Visits the `Add` binary operator belonging to the list identified by `sibling_list_id`. + /// The operands will be in a _two_ item list identified by `child_list_id` + pub visit_add: VisitBinaryOpFn, + /// Visits the `Minus` binary operator belonging to the list identified by `sibling_list_id`. + /// The operands will be in a _two_ item list identified by `child_list_id` + pub visit_minus: VisitBinaryOpFn, + /// Visits the `Multiply` binary operator belonging to the list identified by `sibling_list_id`. + /// The operands will be in a _two_ item list identified by `child_list_id` + pub visit_multiply: VisitBinaryOpFn, + /// Visits the `Divide` binary operator belonging to the list identified by `sibling_list_id`. + /// The operands will be in a _two_ item list identified by `child_list_id` + pub visit_divide: VisitBinaryOpFn, + /// Visits the `column` belonging to the list identified by `sibling_list_id`. + pub visit_column: + extern "C" fn(data: *mut c_void, sibling_list_id: usize, name: KernelStringSlice), + /// Visits a `StructExpression` belonging to the list identified by `sibling_list_id`. + /// The sub-expressions of the `StructExpression` are in a list identified by `child_list_id` + pub visit_struct_expr: + extern "C" fn(data: *mut c_void, sibling_list_id: usize, child_list_id: usize), +} + +/// Visit the expression of the passed [`SharedExpression`] Handle using the provided `visitor`. +/// See the documentation of [`EngineExpressionVisitor`] for a description of how this visitor +/// works. +/// +/// This method returns the id that the engine generated for the top level expression +/// +/// # Safety +/// +/// The caller must pass a valid SharedExpression Handle and expression visitor +#[no_mangle] +pub unsafe extern "C" fn visit_expression( + expression: &Handle, + visitor: &mut EngineExpressionVisitor, +) -> usize { + macro_rules! call { + ( $visitor:ident, $visitor_fn:ident $(, $extra_args:expr) *) => { + ($visitor.$visitor_fn)($visitor.data $(, $extra_args) *) + }; + } + fn visit_expression_array( + visitor: &mut EngineExpressionVisitor, + array: &ArrayData, + sibling_list_id: usize, + ) { + #[allow(deprecated)] + let elements = array.array_elements(); + let child_list_id = call!(visitor, make_field_list, elements.len()); + for scalar in elements { + visit_expression_scalar(visitor, scalar, child_list_id); + } + call!(visitor, visit_literal_array, sibling_list_id, child_list_id); + } + fn visit_expression_struct_literal( + visitor: &mut EngineExpressionVisitor, + struct_data: &StructData, + sibling_list_id: usize, + ) { + let child_value_list_id = call!(visitor, make_field_list, struct_data.fields().len()); + let child_field_list_id = call!(visitor, make_field_list, struct_data.fields().len()); + for (field, value) in struct_data.fields().iter().zip(struct_data.values()) { + call!( + visitor, + visit_literal_string, + child_field_list_id, + field.name().into() + ); + visit_expression_scalar(visitor, value, child_value_list_id); + } + call!( + visitor, + visit_literal_struct, + sibling_list_id, + child_field_list_id, + child_value_list_id + ) + } + fn visit_expression_struct_expr( + visitor: &mut EngineExpressionVisitor, + exprs: &[Expression], + sibling_list_id: usize, + ) { + let child_list_id = call!(visitor, make_field_list, exprs.len()); + for expr in exprs { + visit_expression_impl(visitor, expr, child_list_id); + } + call!(visitor, visit_struct_expr, sibling_list_id, child_list_id) + } + fn visit_expression_variadic( + visitor: &mut EngineExpressionVisitor, + op: &VariadicOperator, + exprs: &[Expression], + sibling_list_id: usize, + ) { + let child_list_id = call!(visitor, make_field_list, exprs.len()); + for expr in exprs { + visit_expression_impl(visitor, expr, child_list_id); + } + + let visit_fn = match op { + VariadicOperator::And => &visitor.visit_and, + VariadicOperator::Or => &visitor.visit_or, + }; + visit_fn(visitor.data, sibling_list_id, child_list_id); + } + fn visit_expression_scalar( + visitor: &mut EngineExpressionVisitor, + scalar: &Scalar, + sibling_list_id: usize, + ) { + match scalar { + Scalar::Integer(val) => call!(visitor, visit_literal_int, sibling_list_id, *val), + Scalar::Long(val) => call!(visitor, visit_literal_long, sibling_list_id, *val), + Scalar::Short(val) => call!(visitor, visit_literal_short, sibling_list_id, *val), + Scalar::Byte(val) => call!(visitor, visit_literal_byte, sibling_list_id, *val), + Scalar::Float(val) => call!(visitor, visit_literal_float, sibling_list_id, *val), + Scalar::Double(val) => { + call!(visitor, visit_literal_double, sibling_list_id, *val) + } + Scalar::String(val) => { + call!(visitor, visit_literal_string, sibling_list_id, val.into()) + } + Scalar::Boolean(val) => call!(visitor, visit_literal_bool, sibling_list_id, *val), + Scalar::Timestamp(val) => { + call!(visitor, visit_literal_timestamp, sibling_list_id, *val) + } + Scalar::TimestampNtz(val) => { + call!(visitor, visit_literal_timestamp_ntz, sibling_list_id, *val) + } + Scalar::Date(val) => call!(visitor, visit_literal_date, sibling_list_id, *val), + Scalar::Binary(buf) => call!( + visitor, + visit_literal_binary, + sibling_list_id, + buf.as_ptr(), + buf.len() + ), + Scalar::Decimal(value, precision, scale) => { + let ms: u64 = (value >> 64) as u64; + let ls: u64 = *value as u64; + call!( + visitor, + visit_literal_decimal, + sibling_list_id, + ms, + ls, + *precision, + *scale + ) + } + Scalar::Null(_) => call!(visitor, visit_literal_null, sibling_list_id), + Scalar::Struct(struct_data) => { + visit_expression_struct_literal(visitor, struct_data, sibling_list_id) + } + Scalar::Array(array) => visit_expression_array(visitor, array, sibling_list_id), + } + } + fn visit_expression_impl( + visitor: &mut EngineExpressionVisitor, + expression: &Expression, + sibling_list_id: usize, + ) { + match expression { + Expression::Literal(scalar) => { + visit_expression_scalar(visitor, scalar, sibling_list_id) + } + Expression::Column(name) => { + call!(visitor, visit_column, sibling_list_id, name.as_str().into()) + } + Expression::Struct(exprs) => { + visit_expression_struct_expr(visitor, exprs, sibling_list_id) + } + Expression::BinaryOperation { op, left, right } => { + let child_list_id = call!(visitor, make_field_list, 2); + visit_expression_impl(visitor, left, child_list_id); + visit_expression_impl(visitor, right, child_list_id); + let op = match op { + BinaryOperator::Plus => visitor.visit_add, + BinaryOperator::Minus => visitor.visit_minus, + BinaryOperator::Multiply => visitor.visit_multiply, + BinaryOperator::Divide => visitor.visit_divide, + BinaryOperator::LessThan => visitor.visit_lt, + BinaryOperator::LessThanOrEqual => visitor.visit_le, + BinaryOperator::GreaterThan => visitor.visit_gt, + BinaryOperator::GreaterThanOrEqual => visitor.visit_ge, + BinaryOperator::Equal => visitor.visit_eq, + BinaryOperator::NotEqual => visitor.visit_ne, + BinaryOperator::Distinct => visitor.visit_distinct, + BinaryOperator::In => visitor.visit_in, + BinaryOperator::NotIn => visitor.visit_not_in, + }; + op(visitor.data, sibling_list_id, child_list_id); + } + Expression::UnaryOperation { op, expr } => { + let child_id_list = call!(visitor, make_field_list, 1); + visit_expression_impl(visitor, expr, child_id_list); + let op = match op { + UnaryOperator::Not => visitor.visit_not, + UnaryOperator::IsNull => visitor.visit_is_null, + }; + op(visitor.data, sibling_list_id, child_id_list); + } + Expression::VariadicOperation { op, exprs } => { + visit_expression_variadic(visitor, op, exprs, sibling_list_id) + } + } + } + let top_level = call!(visitor, make_field_list, 1); + visit_expression_impl(visitor, expression.as_ref(), top_level); + top_level +} diff --git a/ffi/src/expressions/mod.rs b/ffi/src/expressions/mod.rs new file mode 100644 index 000000000..a6756f972 --- /dev/null +++ b/ffi/src/expressions/mod.rs @@ -0,0 +1,10 @@ +//! This module holds functionality for moving expressions across the FFI boundary, both from +//! engine to kernel, and from kernel to engine. +use delta_kernel::Expression; +use delta_kernel_ffi_macros::handle_descriptor; + +pub mod engine; +pub mod kernel; + +#[handle_descriptor(target=Expression, mutable=false, sized=true)] +pub struct SharedExpression; diff --git a/ffi/src/lib.rs b/ffi/src/lib.rs index ff3d60f2e..290c3a09d 100644 --- a/ffi/src/lib.rs +++ b/ffi/src/lib.rs @@ -33,6 +33,8 @@ pub mod engine_funcs; pub mod expressions; pub mod scan; pub mod schema; +#[cfg(feature = "test-ffi")] +pub mod test_ffi; pub(crate) type NullableCvoid = Option>; diff --git a/ffi/src/scan.rs b/ffi/src/scan.rs index 8687c9a3a..d388a6e0c 100644 --- a/ffi/src/scan.rs +++ b/ffi/src/scan.rs @@ -13,7 +13,9 @@ use delta_kernel_ffi_macros::handle_descriptor; use tracing::debug; use url::Url; -use crate::expressions::{unwrap_kernel_expression, EnginePredicate, KernelExpressionVisitorState}; +use crate::expressions::engine::{ + unwrap_kernel_expression, EnginePredicate, KernelExpressionVisitorState, +}; use crate::{ AllocateStringFn, ExclusiveEngineData, ExternEngine, ExternResult, IntoExternResult, KernelBoolSlice, KernelRowIndexArray, KernelStringSlice, NullableCvoid, SharedExternEngine, diff --git a/ffi/src/test_ffi.rs b/ffi/src/test_ffi.rs new file mode 100644 index 000000000..fa7e2ef05 --- /dev/null +++ b/ffi/src/test_ffi.rs @@ -0,0 +1,95 @@ +//! Utility functions used for testing ffi code + +use std::{ops::Not, sync::Arc}; + +use crate::{expressions::SharedExpression, handle::Handle}; +use delta_kernel::{ + expressions::{column_expr, ArrayData, BinaryOperator, Expression, Scalar, StructData}, + schema::{ArrayType, DataType, StructField, StructType}, +}; + +/// Constructs a kernel expression that is passed back as a SharedExpression handle. The expected +/// output expression can be found in `ffi/tests/test_expression_visitor/expected.txt`. +/// +/// # Safety +/// The caller is responsible for freeing the retured memory, either by calling +/// [`free_kernel_predicate`], or [`Handle::drop_handle`] +#[no_mangle] +pub unsafe extern "C" fn get_testing_kernel_expression() -> Handle { + use Expression as Expr; + + let array_type = ArrayType::new( + DataType::Primitive(delta_kernel::schema::PrimitiveType::Short), + false, + ); + let array_data = ArrayData::new(array_type.clone(), vec![Scalar::Short(5), Scalar::Short(0)]); + + let nested_fields = vec![ + StructField::new("a", DataType::INTEGER, false), + StructField::new("b", array_type, false), + ]; + let nested_values = vec![Scalar::Integer(500), Scalar::Array(array_data.clone())]; + let nested_struct = StructData::try_new(nested_fields.clone(), nested_values).unwrap(); + let nested_struct_type = StructType::new(nested_fields); + + let top_level_struct = StructData::try_new( + vec![StructField::new( + "top", + DataType::Struct(Box::new(nested_struct_type)), + true, + )], + vec![Scalar::Struct(nested_struct)], + ) + .unwrap(); + + let mut sub_exprs = vec![ + Expr::literal(i8::MAX), + Expr::literal(i8::MIN), + Expr::literal(f32::MAX), + Expr::literal(f32::MIN), + Expr::literal(f64::MAX), + Expr::literal(f64::MIN), + Expr::literal(i32::MAX), + Expr::literal(i32::MIN), + Expr::literal(i64::MAX), + Expr::literal(i64::MIN), + Expr::literal("hello expressions"), + Expr::literal(true), + Expr::literal(false), + Scalar::Timestamp(50).into(), + Scalar::TimestampNtz(100).into(), + Scalar::Date(32).into(), + Scalar::Binary(0x0000deadbeefcafeu64.to_be_bytes().to_vec()).into(), + // Both the most and least significant u64 of the Decimal value will be 1 + Scalar::Decimal((1 << 64) + 1, 2, 3).into(), + Expr::null_literal(DataType::SHORT), + Scalar::Struct(top_level_struct).into(), + Scalar::Array(array_data).into(), + Expr::struct_from(vec![Expr::or_from(vec![ + Scalar::Integer(5).into(), + Scalar::Long(20).into(), + ])]), + Expr::not(Expr::is_null(column_expr!("col"))), + ]; + sub_exprs.extend( + [ + BinaryOperator::In, + BinaryOperator::Plus, + BinaryOperator::Minus, + BinaryOperator::Equal, + BinaryOperator::NotEqual, + BinaryOperator::NotIn, + BinaryOperator::Divide, + BinaryOperator::Multiply, + BinaryOperator::LessThan, + BinaryOperator::LessThanOrEqual, + BinaryOperator::GreaterThan, + BinaryOperator::GreaterThanOrEqual, + BinaryOperator::Distinct, + ] + .iter() + .map(|op| Expr::binary(*op, Scalar::Integer(0), Scalar::Long(0))), + ); + + Arc::new(Expr::and_from(sub_exprs)).into() +} diff --git a/ffi/tests/test-expression-visitor/expected.txt b/ffi/tests/test-expression-visitor/expected.txt new file mode 100644 index 000000000..4321df93d --- /dev/null +++ b/ffi/tests/test-expression-visitor/expected.txt @@ -0,0 +1,78 @@ +And + Byte(127) + Byte(-128) + Float(340282346638528859811704183484516925440.000000) + Float(-340282346638528859811704183484516925440.000000) + Double(179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.000000) + Double(-179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.000000) + Integer(2147483647) + Integer(-2147483648) + Long(9223372036854775807) + Long(-9223372036854775808) + String(hello expressions) + Boolean(1) + Boolean(0) + Timestamp(50) + TimestampNtz(100) + Date(32) + Binary(0000deadbeefcafe) + Decimal(1,1, 3, 2) + Null + Struct + Field: top + Struct + Field: a + Integer(500) + Field: b + Array + Short(5) + Short(0) + Array + Short(5) + Short(0) + StructExpression + Or + Integer(5) + Long(20) + Not + IsNull + Column(col) + In + Integer(0) + Long(0) + Add + Integer(0) + Long(0) + Minus + Integer(0) + Long(0) + Equal + Integer(0) + Long(0) + NotEqual + Integer(0) + Long(0) + NotIn + Integer(0) + Long(0) + Divide + Integer(0) + Long(0) + Multiply + Integer(0) + Long(0) + LessThan + Integer(0) + Long(0) + LessThanOrEqual + Integer(0) + Long(0) + GreaterThan + Integer(0) + Long(0) + GreaterThanOrEqual + Integer(0) + Long(0) + Distinct + Integer(0) + Long(0) diff --git a/ffi/tests/test-expression-visitor/run_test.sh b/ffi/tests/test-expression-visitor/run_test.sh new file mode 100755 index 000000000..2c00c9ea4 --- /dev/null +++ b/ffi/tests/test-expression-visitor/run_test.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +set -euxo pipefail + +OUT_FILE=$(mktemp) +./visit_expression | tee "$OUT_FILE" +diff -s "$OUT_FILE" "$1" +DIFF_EXIT_CODE=$? +echo "Diff exited with $DIFF_EXIT_CODE" +rm "$OUT_FILE" +exit "$DIFF_EXIT_CODE" + diff --git a/kernel/src/engine/parquet_stats_skipping/tests.rs b/kernel/src/engine/parquet_stats_skipping/tests.rs index b4bdd97d3..a723c4600 100644 --- a/kernel/src/engine/parquet_stats_skipping/tests.rs +++ b/kernel/src/engine/parquet_stats_skipping/tests.rs @@ -164,7 +164,7 @@ fn test_binary_scalars() { Struct(StructData::try_new(vec![], vec![]).unwrap()), Array(ArrayData::new( ArrayType::new(DataType::LONG, false), - vec![], + Vec::::new(), )), ]; let larger_values = &[ @@ -185,7 +185,7 @@ fn test_binary_scalars() { Struct(StructData::try_new(vec![], vec![]).unwrap()), Array(ArrayData::new( ArrayType::new(DataType::LONG, false), - vec![], + Vec::::new(), )), ]; diff --git a/kernel/src/expressions/column_names.rs b/kernel/src/expressions/column_names.rs index 4129abb30..ca0e2ca70 100644 --- a/kernel/src/expressions/column_names.rs +++ b/kernel/src/expressions/column_names.rs @@ -135,7 +135,7 @@ impl Hash for ColumnName { #[doc(hidden)] macro_rules! __column_name { ( $($name:tt)* ) => { - $crate::expressions::ColumnName::new(delta_kernel_derive::parse_column_name!($($name)*)) + $crate::expressions::ColumnName::new($crate::delta_kernel_derive::parse_column_name!($($name)*)) }; } #[doc(inline)] diff --git a/kernel/src/expressions/scalars.rs b/kernel/src/expressions/scalars.rs index 3578258e9..bbbabe17e 100644 --- a/kernel/src/expressions/scalars.rs +++ b/kernel/src/expressions/scalars.rs @@ -20,8 +20,9 @@ pub struct ArrayData { } impl ArrayData { - #[cfg(test)] - pub(crate) fn new(tpe: ArrayType, elements: Vec) -> Self { + #[cfg_attr(feature = "developer-visibility", visibility::make(pub))] + pub fn new(tpe: ArrayType, elements: impl IntoIterator>) -> Self { + let elements = elements.into_iter().map(Into::into).collect(); Self { tpe, elements } } pub fn array_type(&self) -> &ArrayType { diff --git a/kernel/src/lib.rs b/kernel/src/lib.rs index 6d47d9ae2..2f686a3ad 100644 --- a/kernel/src/lib.rs +++ b/kernel/src/lib.rs @@ -76,6 +76,7 @@ pub mod table; pub mod transaction; pub(crate) mod utils; +pub use delta_kernel_derive; pub use engine_data::{DataVisitor, EngineData}; pub use error::{DeltaResult, Error}; pub use expressions::{Expression, ExpressionRef};