Skip to content

add scan tests with null values #1865

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 67 additions & 0 deletions tests/io/test_pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,12 @@
BoundNotStartsWith,
BoundReference,
BoundStartsWith,
EqualTo,
GreaterThan,
IsNull,
Not,
NotEqualTo,
NotNull,
Or,
)
from pyiceberg.expressions.literals import literal
Expand Down Expand Up @@ -2317,3 +2321,66 @@ def test_pyarrow_io_multi_fs() -> None:

# Same PyArrowFileIO instance resolves local file input to LocalFileSystem
assert isinstance(pyarrow_file_io.new_input("file:///path/to/file")._filesystem, LocalFileSystem)


def test_scan_nulls(catalog: InMemoryCatalog, arrow_table_with_null: pa.Table) -> None:
    """Pin how scans over a nullable string column treat nulls.

    Every predicate is checked three ways: as a string row filter, as an
    expression object passed as the row filter, and as a PyArrow compute
    filter applied to the result of an unfiltered scan.
    """
    import pyarrow.compute as pc

    catalog.create_namespace("default")
    table = catalog.create_table(
        "default.test_scan_nulls",
        schema=arrow_table_with_null.schema,
    )
    table.append(arrow_table_with_null)

    def scanned_rows(row_filter) -> int:
        # Number of rows returned when the predicate is pushed into the scan.
        return len(table.scan(row_filter=row_filter).to_arrow())

    def arrow_filtered_rows(mask) -> int:
        # Number of rows left when the predicate is applied by PyArrow after a full scan.
        return len(table.scan().to_arrow().filter(mask))

    string_col = pc.field("string")

    # Fixture data: "string": ["a", None, "z"]

    # IS NULL matches only the null row.
    for null_filter in ("string is null", IsNull("string")):
        assert scanned_rows(null_filter) == 1
    assert arrow_filtered_rows(string_col.is_null()) == 1

    # IS NOT NULL matches the two populated rows.
    for not_null_filter in ("string is not null", NotNull("string")):
        assert scanned_rows(not_null_filter) == 2
    assert arrow_filtered_rows(string_col.is_valid()) == 2

    # Equality matches exactly the "a" row.
    for equal_filter in ("string == 'a'", EqualTo(term="string", literal="a")):
        assert scanned_rows(equal_filter) == 1
    assert arrow_filtered_rows(string_col == "a") == 1

    # NOTE(review): the author's expectation is that inequality should return 2
    # rows (the null row included). The assertions below pin the currently
    # observed result of 1, where the null row is dropped -- confirm the intended
    # semantics before changing them.
    not_equal_filters = (
        "string != 'a'",
        NotEqualTo(term="string", literal="a"),
        Not(EqualTo(term="string", literal="a")),
    )
    for not_equal_filter in not_equal_filters:
        assert scanned_rows(not_equal_filter) == 1
    assert arrow_filtered_rows(string_col != "a") == 1
Comment on lines +2349 to +2353
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Fokko i think this might be a bug in how we handle nulls right now

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ugh, this gets really messy. for example, if string != 'a' should match null, what about ~(string != 'a')?

Copy link
Contributor

@Fokko Fokko Mar 31, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, this looks incorrect to me. ~(string != 'a') should be rewritten to string = 'a', this happens in rewrite_not.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another option is:

table.scan().to_arrow().filter(pc.coalesce(pc.field("string") != "a", pc.scalar(False)))

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another option is to remove the `Not`s at the beginning of the ScanPlan.



def test_scan_kleene(catalog: InMemoryCatalog, arrow_table_with_null: pa.Table) -> None:
    """Check AND/OR combinations of a null check and an equality in string row filters.

    Each case combines ``string is [not] null`` with ``string = 'a'`` and asserts
    how many rows the scan returns.
    """
    catalog.create_namespace("default")
    # Use an identifier specific to this test rather than reusing the one that
    # belongs to test_scan_nulls.
    table = catalog.create_table(
        "default.test_scan_kleene",
        schema=arrow_table_with_null.schema,
    )
    table.append(arrow_table_with_null)

    # Fixture data: "string": ["a", None, "z"]
    expected_row_counts = {
        "string is null OR string = 'a'": 2,  # {null, a}
        "string is null AND string = 'a'": 0,  # {}
        "string is not null OR string = 'a'": 2,  # {a, z}
        "string is not null AND string = 'a'": 1,  # {a}
    }
    for row_filter, expected in expected_row_counts.items():
        actual = len(table.scan(row_filter=row_filter).to_arrow())
        assert actual == expected, f"{row_filter!r}: expected {expected} rows, got {actual}"


def test_scan_complements(catalog: InMemoryCatalog, arrow_table_with_null: pa.Table) -> None:
from pyiceberg.expressions.visitors import bind
from pyiceberg.io.pyarrow import _expression_to_complementary_pyarrow
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_expression_to_complementary_pyarrow explicitly calls out null handling

def _expression_to_complementary_pyarrow(expr: BooleanExpression) -> pc.Expression:
"""Complementary filter conversion function of expression_to_pyarrow.
Could not use expression_to_pyarrow(Not(expr)) to achieve this complementary effect because ~ in pyarrow.compute.Expression does not handle null.


# Create the namespace and a table whose schema is taken from the fixture,
# then load the fixture rows into it.
catalog.create_namespace("default")
table = catalog.create_table(
"default.test_scan_complements",
schema=arrow_table_with_null.schema,
)
table.append(arrow_table_with_null)

# Baseline: the equality predicate used as a row filter returns a single row.
string_equal = EqualTo(term="string", literal=("a"))
assert len(table.scan(row_filter=string_equal).to_arrow()) == 1
# The complementary conversion is given a bound expression, so bind the
# predicate to the table schema first (case-insensitive field lookup).
bound_string_equal = bind(table.schema(), string_equal, case_sensitive=False)
filter_expression = _expression_to_complementary_pyarrow(bound_string_equal)
# Applying the complement to an unfiltered scan keeps the two remaining rows.
# NOTE(review): presumably "z" and the null row, assuming the fixture's string
# column is ["a", None, "z"] -- confirm against the fixture.
assert len(table.scan().to_arrow().filter(filter_expression)) == 2 # complements handles null correctly