Skip to content

Vector index corruption on row delete with duplicates #2197

@MatthewSteel

Description

@MatthewSteel

Issue observed in Python with the libsql library, version 0.1.11.

If a table contains an indexed vector column, and two rows in that table have identical vector values, deleting one of the rows can result in both being removed from the vector index. The remaining row will appear in a regular table scan, but is unretrievable with vector_top_k -- vector_top_k will return every row in the table except the duplicate. Other rows never seem to go missing. Reindexing fixes the issue but is very slow.

Without the delete, both rows are reliably returned by vector_top_k, so this doesn't look like an insert overwriting an existing entry; it looks more like a too-broad delete.

I believe the behavior is data-dependent -- in my testing, with a 768-dim vector and 20 randomly-generated unit vector rows this happens reliably, but for smaller dimensions and fewer rows it doesn't always happen.

I have a Python script (below) that runs various configurations, most of which fail. I would guess that the right way to investigate it is to:

  • Use random seeds for reproducibility,
  • Find a small case (3 rows, 2 dimensions?) that passes, another that fails,
  • Look at the index structure and see what it's doing, why one works and the other doesn't.
#!/usr/bin/env python3
"""Reproduce libsql vector index bug with duplicate embeddings.

Bug: When two rows have identical embeddings and you delete one,
the other row becomes unsearchable via vector_top_k.
"""

import os
import random
import tempfile

import libsql


def generate_random_embedding(dim: int, rng: "random.Random | None" = None) -> list[float]:
    """Generate a random unit-length embedding vector.

    Each component is drawn from a normal distribution with mean 0 and
    standard deviation 1, and the vector is then divided by its Euclidean
    length so the result has length 1.

    Args:
        dim: Number of components in the vector.
        rng: Optional ``random.Random`` instance to draw from. Pass a seeded
            instance to get reproducible vectors; when omitted, the
            module-level ``random`` generator is used.

    Returns:
        A list of ``dim`` floats. The list is empty when ``dim`` is 0.
    """
    # Choose the sampling function once so the default path is exactly the
    # module-level generator.
    gauss = random.gauss if rng is None else rng.gauss
    vec = [gauss(0, 1) for _ in range(dim)]
    magnitude = sum(x * x for x in vec) ** 0.5
    return [x / magnitude for x in vec]


def _run_statements(db_path: str, statements) -> None:
    """Execute statements on a fresh connection, commit, and always close it.

    Args:
        db_path: Path of the database file to connect to.
        statements: Iterable of ``(sql, params)`` pairs. ``params`` may be
            ``None``, in which case the statement is executed without
            parameters.
    """
    conn = libsql.connect(db_path)
    try:
        for sql, params in statements:
            if params is None:
                conn.execute(sql)
            else:
                conn.execute(sql, params)
        conn.commit()
    finally:
        # Close even when a statement fails so the database file is not left
        # open when the caller removes it.
        conn.close()


def test_with_dim(dim: int, n_existing: int = 5) -> bool:
    """Test if bug reproduces with given embedding dimension.

    Creates a temporary database with a vector-indexed table, inserts
    ``n_existing`` random rows, inserts two rows (ids 100 and 101) that share
    one embedding, deletes id 100, and then queries ``vector_top_k`` for that
    embedding. Every step runs on its own connection, as in the original
    reproduction.

    Args:
        dim: Dimension of the ``F32_BLOB`` embedding column.
        n_existing: Number of random rows inserted before the duplicate pair.

    Returns:
        True if the bug was found, i.e. id 101 is missing from the
        ``vector_top_k`` results; False otherwise.
    """
    with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
        db_path = tmp.name

    try:
        # Setup table and index
        _run_statements(db_path, [
            (
                f"""
            CREATE TABLE test_vectors (
                id INTEGER PRIMARY KEY,
                text_embedding F32_BLOB({dim}) NOT NULL
            )
        """,
                None,
            ),
            (
                """
            CREATE INDEX test_vectors_idx ON test_vectors (libsql_vector_idx(text_embedding))
        """,
                None,
            ),
        ])

        # Insert n_existing rows with random embeddings, one connection per row
        for i in range(1, n_existing + 1):
            vec = generate_random_embedding(dim)
            _run_statements(db_path, [
                (
                    f"INSERT INTO test_vectors (id, text_embedding) VALUES ({i}, vector(?))",
                    (str(vec),),
                ),
            ])

        # Generate target embedding and insert row with id=100
        target_embedding = generate_random_embedding(dim)
        _run_statements(db_path, [
            (
                "INSERT INTO test_vectors (id, text_embedding) VALUES (100, vector(?))",
                (str(target_embedding),),
            ),
        ])

        # Insert duplicate with id=101 (same embedding as id=100)
        _run_statements(db_path, [
            (
                "INSERT INTO test_vectors (id, text_embedding) VALUES (101, vector(?))",
                (str(target_embedding),),
            ),
        ])

        # Delete original (id=100)
        _run_statements(db_path, [
            ("DELETE FROM test_vectors WHERE id = 100", None),
        ])

        # Search for the embedding - should find id=101.
        # The limit is larger than the table so every indexed row can be returned.
        conn = libsql.connect(db_path)
        try:
            result = conn.execute(
                """
            SELECT t.id
            FROM vector_top_k('test_vectors_idx', ?, ?) AS vtk
            JOIN test_vectors t ON t.rowid = vtk.id
            """,
                (str(target_embedding), n_existing + 10)
            )
            rows = result.fetchall()
        finally:
            conn.close()

        # Bug exists if id=101 is not in results
        return not any(r[0] == 101 for r in rows)

    finally:
        if os.path.exists(db_path):
            os.remove(db_path)


def main():
    """Run the reproduction over several dimensions, then repeat the 768-dim case.

    Prints one status line per dimension (with 5 pre-existing rows), then runs
    the 768-dimension case five times with 20 pre-existing rows and prints how
    many of those trials reproduced the bug.
    """
    def describe(bug_found):
        # Human-readable marker for a single test outcome.
        return "❌ BUG" if bug_found else "✓ OK"

    for header_line in (
        "Testing libsql vector index bug with duplicate embeddings",
        "Fixed: 5 existing rows in index\n",
        "Testing with different embedding dimensions:",
        "(Bug: after deleting duplicate, remaining row becomes unsearchable)\n",
    ):
        print(header_line)

    for dim in (2, 3, 5, 10, 100, 768):
        status = describe(test_with_dim(dim, n_existing=5))
        print(f"  {dim:3d} dimensions: {status}")

    print("\nNote: Bug is nondeterministic but reproduces frequently.")
    print("Testing 768-dim case 5 times to show consistency:\n")

    outcomes = []
    for i in range(5):
        outcomes.append(test_with_dim(768, n_existing=20))
        status = describe(outcomes[-1])
        print(f"  Trial {i+1}: {status}")
    bug_count = sum(1 for reproduced in outcomes if reproduced)

    print(f"\n768-dim bug reproduced in {bug_count}/5 trials ({bug_count*20}%)")


# Run the reproduction only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions