risinglightdb · Feb 14, 2025
diff --git a/‎src/binder/create_index.rs
Lines changed: 109 additions & 0 deletions b/‎src/binder/create_index.rs
Lines changed: 109 additions & 0 deletions
diff --git a/‎src/binder/mod.rs
Lines changed: 1 addition & 1 deletion b/‎src/binder/mod.rs
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/catalog/index.rs
Lines changed: 14 additions & 1 deletion b/‎src/catalog/index.rs
Lines changed: 14 additions & 1 deletion
diff --git a/‎src/catalog/root.rs
Lines changed: 3 additions & 1 deletion b/‎src/catalog/root.rs
Lines changed: 3 additions & 1 deletion
diff --git a/‎src/catalog/schema.rs
Lines changed: 9 additions & 1 deletion b/‎src/catalog/schema.rs
Lines changed: 9 additions & 1 deletion
diff --git a/‎src/executor/create_index.rs
Lines changed: 1 addition & 0 deletions b/‎src/executor/create_index.rs
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/planner/cost.rs
Lines changed: 1 addition & 1 deletion b/‎src/planner/cost.rs
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/planner/explain.rs
Lines changed: 10 additions & 0 deletions b/‎src/planner/explain.rs
Lines changed: 10 additions & 0 deletions
diff --git a/‎src/planner/mod.rs
Lines changed: 1 addition & 0 deletions b/‎src/planner/mod.rs
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/planner/optimizer.rs
Lines changed: 2 additions & 1 deletion b/‎src/planner/optimizer.rs
Lines changed: 2 additions & 1 deletion
diff --git a/‎src/planner/rules/plan.rs
Lines changed: 76 additions & 0 deletions b/‎src/planner/rules/plan.rs
Lines changed: 76 additions & 0 deletions
diff --git a/‎src/storage/memory/mod.rs
Lines changed: 9 additions & 1 deletion b/‎src/storage/memory/mod.rs
Lines changed: 9 additions & 1 deletion
diff --git a/‎src/storage/mod.rs
Lines changed: 2 additions & 0 deletions b/‎src/storage/mod.rs
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/storage/secondary/mod.rs
Lines changed: 9 additions & 1 deletion b/‎src/storage/secondary/mod.rs
Lines changed: 9 additions & 1 deletion
diff --git a/‎tests/planner_test/vector.planner.sql
Lines changed: 15 additions & 0 deletions b/‎tests/planner_test/vector.planner.sql
Lines changed: 15 additions & 0 deletions
diff --git a/‎tests/planner_test/vector.yml
Lines changed: 18 additions & 0 deletions b/‎tests/planner_test/vector.yml
Lines changed: 18 additions & 0 deletions
diff --git a/‎tests/sql/catalog.slt
Lines changed: 1 addition & 1 deletion b/‎tests/sql/catalog.slt
Lines changed: 1 addition & 1 deletion
diff --git a/‎tests/sql/vector_index.slt
Lines changed: 18 additions & 0 deletions b/‎tests/sql/vector_index.slt
Lines changed: 18 additions & 0 deletions
@@ -10,12 +10,44 @@ use serde::{Deserialize, Serialize};
 use super::*;
 use crate::catalog::{ColumnId, SchemaId, TableId};
 
+/// Distance function used by a vector index.
+///
+/// Each variant can be parsed from a name or from the matching SQL operator through the
+/// `FromStr` implementation: `cosine` / `<=>`, `l2` / `<->`, `dotproduct` / `<#>`.
+#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Clone, Serialize, Deserialize)]
+pub enum VectorDistance {
+    /// Cosine distance (`cosine`, operator `<=>`).
+    Cosine,
+    /// L2 distance (`l2`, operator `<->`).
+    L2,
+    /// Negative dot product (`dotproduct`, operator `<#>`).
+    NegativeDotProduct,
+}
+
+impl FromStr for VectorDistance {
+    type Err = String;
+
+    /// Parses a distance function from its name (`cosine`, `l2`, `dotproduct`) or from the
+    /// corresponding operator (`<=>`, `<->`, `<#>`).
+    ///
+    /// Matching is exact, so callers must normalise case themselves. Any other input is
+    /// rejected with the message `invalid vector distance: <input>`.
+    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
+        let distance = match s {
+            "cosine" | "<=>" => Self::Cosine,
+            "l2" | "<->" => Self::L2,
+            "dotproduct" | "<#>" => Self::NegativeDotProduct,
+            unknown => return Err(format!("invalid vector distance: {}", unknown)),
+        };
+        Ok(distance)
+    }
+}
+
+/// The kind of index requested by `CREATE INDEX ... USING <type>`, with its parameters.
+#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Clone, Serialize, Deserialize)]
+pub enum IndexType {
+    /// `USING hnsw`; carries no parameters.
+    Hnsw,
+    /// `USING ivfflat`, configured through the options of the `WITH (...)` clause.
+    IvfFlat {
+        /// Distance function, taken from the `distfn` option.
+        distance: VectorDistance,
+        /// Value of the `nlists` option.
+        nlists: usize,
+        /// Value of the `nprobe` option.
+        nprobe: usize,
+    },
+    /// `USING btree`; carries no parameters.
+    Btree,
+}
+
+/// A bound `CREATE INDEX` statement.
 #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Clone, Serialize, Deserialize)]
 pub struct CreateIndex {
     pub schema_id: SchemaId,
     pub index_name: String,
     pub table_id: TableId,
     pub columns: Vec<ColumnId>,
+    /// Index kind and its parameters, parsed from the `USING` / `WITH` clauses.
+    pub index_type: IndexType,
 }
 
 impl fmt::Display for CreateIndex {
@@ -48,6 +80,80 @@ impl FromStr for Box<CreateIndex> {
 }
 
 impl Binder {
+    /// Parses the `USING <type>` and `WITH (...)` clauses of `CREATE INDEX` into an
+    /// [`IndexType`].
+    ///
+    /// Supported index types (matched case-insensitively) are `btree`, `hnsw` and `ivfflat`.
+    /// The `WITH` options are only inspected for `ivfflat`, which requires all of:
+    /// - `distfn`: lower-cased, then parsed with `VectorDistance::from_str`;
+    /// - `nlists` and `nprobe`: values read through `DataValue::as_usize`.
+    ///
+    /// If an option is given more than once, the last occurrence wins.
+    ///
+    /// # Errors
+    ///
+    /// Returns `ErrorKind::InvalidIndex` when the `USING` clause is missing, the index type
+    /// is unknown, an option is malformed or unknown, a required `ivfflat` option is
+    /// missing, or `distfn` does not name a known distance function. None of these cases
+    /// panic: they are all reachable from user-written SQL.
+    fn parse_index_type(&self, using: Option<Ident>, with: Vec<Expr>) -> Result<IndexType> {
+        // Shorthand for the error returned on any invalid input.
+        let invalid = |msg: &str| -> Result<IndexType> {
+            Err(ErrorKind::InvalidIndex(msg.to_string()).into())
+        };
+        let Some(using) = using else {
+            return invalid("using clause is required");
+        };
+        match using.to_string().to_lowercase().as_str() {
+            "btree" => Ok(IndexType::Btree),
+            "hnsw" => Ok(IndexType::Hnsw),
+            "ivfflat" => {
+                let mut distfn = None;
+                let mut nlists = None;
+                let mut nprobe = None;
+                for expr in with {
+                    // Every option must have the shape `<identifier> = <literal>`.
+                    let Expr::BinaryOp { left, op, right } = expr else {
+                        return invalid("invalid with clause");
+                    };
+                    if op != BinaryOperator::Eq {
+                        return invalid("invalid with clause");
+                    }
+                    let Expr::Identifier(Ident { value: key, .. }) = *left else {
+                        return invalid("invalid with clause");
+                    };
+                    let Expr::Value(v) = *right else {
+                        return invalid("invalid with clause");
+                    };
+                    let v: DataValue = v.into();
+                    match key.to_lowercase().as_str() {
+                        "distfn" => {
+                            // NOTE(review): `as_str` presumably expects a string value; confirm
+                            // how it behaves for `distfn = 3` and reject that case if needed.
+                            distfn = Some(v.as_str().to_lowercase());
+                        }
+                        "nlists" => {
+                            // Reject both a conversion failure and an absent value.
+                            let Ok(Some(n)) = v.as_usize() else {
+                                return invalid("invalid with clause");
+                            };
+                            nlists = Some(n);
+                        }
+                        "nprobe" => {
+                            let Ok(Some(n)) = v.as_usize() else {
+                                return invalid("invalid with clause");
+                            };
+                            nprobe = Some(n);
+                        }
+                        _ => return invalid("invalid with clause"),
+                    }
+                }
+                // All three options are mandatory; report the missing one by name.
+                let Some(distfn) = distfn else {
+                    return invalid("distfn is required for ivfflat index");
+                };
+                let Some(nlists) = nlists else {
+                    return invalid("nlists is required for ivfflat index");
+                };
+                let Some(nprobe) = nprobe else {
+                    return invalid("nprobe is required for ivfflat index");
+                };
+                let distance = match VectorDistance::from_str(distfn.as_str()) {
+                    Ok(distance) => distance,
+                    // Forward the parser's own message ("invalid vector distance: ...").
+                    Err(msg) => return invalid(&msg),
+                };
+                Ok(IndexType::IvfFlat {
+                    distance,
+                    nlists,
+                    nprobe,
+                })
+            }
+            _ => invalid("invalid index type"),
+        }
+    }
+
     pub(super) fn bind_create_index(&mut self, stat: crate::parser::CreateIndex) -> Result {
         let Some(ref name) = stat.name else {
             return Err(
@@ -57,6 +163,8 @@ impl Binder {
         let crate::parser::CreateIndex {
             table_name,
             columns,
+            using,
+            with,
             ..
         } = stat;
         let index_name = lower_case_name(name);
@@ -94,6 +202,7 @@ impl Binder {
             index_name: index_name.into(),
             table_id: table.id(),
             columns: column_ids,
+            index_type: self.parse_index_type(using, with)?,
         })));
         Ok(create)
     }
 
@@ -29,7 +29,7 @@ mod select;
 mod table;
 
 pub use self::create_function::CreateFunction;
-pub use self::create_index::CreateIndex;
+pub use self::create_index::{CreateIndex, IndexType, VectorDistance};
 pub use self::create_table::CreateTable;
 pub use self::error::BindError;
 use self::error::ErrorKind;
 
@@ -1,22 +1,31 @@
 // Copyright 2025 RisingLight Project Authors. Licensed under Apache-2.0.
 
 use super::*;
+use crate::binder::IndexType;
 
 /// The catalog of an index.
 pub struct IndexCatalog {
     id: IndexId,
     name: String,
     table_id: TableId,
     column_idxs: Vec<ColumnId>,
+    /// Kind of the index (and its parameters), as passed to `IndexCatalog::new`.
+    index_type: IndexType,
 }
 
 impl IndexCatalog {
-    pub fn new(id: IndexId, name: String, table_id: TableId, column_idxs: Vec<ColumnId>) -> Self {
+    pub fn new(
+        id: IndexId,
+        name: String,
+        table_id: TableId,
+        column_idxs: Vec<ColumnId>,
+        index_type: IndexType,
+    ) -> Self {
         Self {
             id,
             name,
             table_id,
             column_idxs,
+            index_type,
         }
     }
 
@@ -35,4 +44,8 @@ impl IndexCatalog {
     pub fn name(&self) -> &str {
         &self.name
     }
+
+    /// Returns a clone of the index kind (and its parameters) stored in this catalog entry.
+    pub fn index_type(&self) -> IndexType {
+        self.index_type.clone()
+    }
 }
@@ -5,6 +5,7 @@ use std::sync::{Arc, Mutex};
 
 use super::function::FunctionCatalog;
 use super::*;
+use crate::binder::IndexType;
 use crate::parser;
 use crate::planner::RecExpr;
 
@@ -104,10 +105,11 @@ impl RootCatalog {
         index_name: String,
         table_id: TableId,
         column_idxs: &[ColumnId],
+        index_type: &IndexType,
     ) -> Result<IndexId, CatalogError> {
         let mut inner = self.inner.lock().unwrap();
         let schema = inner.schemas.get_mut(&schema_id).unwrap();
-        schema.add_index(index_name, table_id, column_idxs.to_vec())
+        schema.add_index(index_name, table_id, column_idxs.to_vec(), index_type)
     }
 
     pub fn get_index_on_table(&self, schema_id: SchemaId, table_id: TableId) -> Vec<IndexId> {
 
@@ -5,6 +5,7 @@ use std::sync::Arc;
 
 use super::function::FunctionCatalog;
 use super::*;
+use crate::binder::IndexType;
 use crate::planner::RecExpr;
 
 /// The catalog of a schema.
@@ -62,13 +63,20 @@ impl SchemaCatalog {
         name: String,
         table_id: TableId,
         columns: Vec<ColumnId>,
+        index_type: &IndexType,
     ) -> Result<IndexId, CatalogError> {
         if self.indexes_idxs.contains_key(&name) {
             return Err(CatalogError::Duplicated("index", name));
         }
         let index_id = self.next_id;
         self.next_id += 1;
-        let index_catalog = Arc::new(IndexCatalog::new(index_id, name.clone(), table_id, columns));
+        let index_catalog = Arc::new(IndexCatalog::new(
+            index_id,
+            name.clone(),
+            table_id,
+            columns,
+            index_type.clone(),
+        ));
         self.indexes_idxs.insert(name, index_id);
         self.indexes.insert(index_id, index_catalog);
         Ok(index_id)
 
@@ -21,6 +21,7 @@ impl<S: Storage> CreateIndexExecutor<S> {
                 &self.index.index_name,
                 self.index.table_id,
                 &self.index.columns,
+                &self.index.index_type,
             )
             .await?;
 
 
@@ -31,7 +31,7 @@ impl egg::CostFunction<Expr> for CostFn<'_> {
 
         let c = match enode {
             // plan nodes
-            Scan(_) | Values(_) => build(),
+            Scan(_) | Values(_) | IndexScan(_) => build(),
             Order([_, c]) => nlogn(rows(c)) + build() + costs(c),
             Filter([exprs, c]) => costs(exprs) * rows(c) + build() + costs(c),
             Proj([exprs, c]) | Window([exprs, c]) => costs(exprs) * rows(c) + costs(c),
 
@@ -248,6 +248,16 @@ impl<'a> Explain<'a> {
                     ("filter", self.expr(filter).pretty()),
                 ]),
             ),
+            IndexScan([table, columns, filter, key, vector]) => Pretty::childless_record(
+                "IndexScan",
+                with_meta(vec![
+                    ("table", self.expr(table).pretty()),
+                    ("columns", self.expr(columns).pretty()),
+                    ("filter", self.expr(filter).pretty()),
+                    ("key", self.expr(key).pretty()),
+                    ("vector", self.expr(vector).pretty()),
+                ]),
+            ),
             Values(values) => Pretty::simple_record(
                 "Values",
                 with_meta(vec![("rows", Pretty::display(&values.len()))]),
 
@@ -98,6 +98,7 @@ define_language! {
 
         // plans
         "scan" = Scan([Id; 3]),                 // (scan table [column..] filter)
+        "index_scan" = IndexScan([Id; 5]), // (index_scan table [column..] filter key value)
         "values" = Values(Box<[Id]>),           // (values [expr..]..)
         "proj" = Proj([Id; 2]),                 // (proj [expr..] child)
         "filter" = Filter([Id; 2]),             // (filter expr child)
 
@@ -121,13 +121,14 @@ static STAGE1_RULES: LazyLock<Vec<Rewrite>> = LazyLock::new(|| {
 });
 
 /// Stage2 rules in the optimizer.
-/// - pushdown predicate and projection
+/// - pushdown predicate, projection, and index scan
 static STAGE2_RULES: LazyLock<Vec<Rewrite>> = LazyLock::new(|| {
     let mut rules = vec![];
     rules.append(&mut rules::expr::rules());
     rules.append(&mut rules::plan::always_better_rules());
     rules.append(&mut rules::plan::predicate_pushdown_rules());
     rules.append(&mut rules::plan::projection_pushdown_rules());
+    // Rewrites an `order` by a vector distance over a `scan` into an `index_scan`
+    // when a matching vector index exists.
+    rules.append(&mut rules::plan::index_scan_rules());
     rules
 });
 
 
@@ -6,7 +6,9 @@ use itertools::Itertools;
 
 use super::schema::schema_is_eq;
 use super::*;
+use crate::binder::{IndexType, VectorDistance};
 use crate::planner::ExprExt;
+use crate::types::DataValue;
 
 /// Returns the rules that always improve the plan.
 pub fn always_better_rules() -> Vec<Rewrite> {
@@ -398,6 +400,80 @@ pub fn projection_pushdown_rules() -> Vec<Rewrite> { vec![
     ),
 ]}
 
+/// Returns the rules that rewrite an `order` by a single vector distance expression
+/// (`<->`, `<#>` or `<=>`) placed directly over a `scan` into an `index_scan`.
+///
+/// Each rule only fires when `has_vector_index` holds for the matching operator.
+#[rustfmt::skip]
+pub fn index_scan_rules() -> Vec<Rewrite> { vec![
+    rw!("vector-index-scan-1";
+        "(order (list (<-> ?column ?vector)) (scan ?table ?columns ?filter))" =>
+        "(index_scan ?table ?columns ?filter ?column ?vector)"
+        if has_vector_index("?column", "<->", "?vector", "?filter")
+    ),
+    rw!("vector-index-scan-2";
+        "(order (list (<#> ?column ?vector)) (scan ?table ?columns ?filter))" =>
+        "(index_scan ?table ?columns ?filter ?column ?vector)"
+        if has_vector_index("?column", "<#>", "?vector", "?filter")
+    ),
+    rw!("vector-index-scan-3";
+        "(order (list (<=> ?column ?vector)) (scan ?table ?columns ?filter))" =>
+        "(index_scan ?table ?columns ?filter ?column ?vector)"
+        if has_vector_index("?column", "<=>", "?vector", "?filter")
+    ),
+]}
+
+/// Check if there is a vector index matching the statement. i.e.,
+/// `SELECT * FROM t ORDER BY v <-> constant_vector` will match the index
+/// on the table t with the vector column v and using the `<->` distance function.
+///
+/// `column`, `vector` and `filter` are the names of pattern variables; `op` is the distance
+/// operator of the rule. The returned condition holds only when all of the following are true:
+/// - `op` parses as a `VectorDistance`;
+/// - the filter's constant is `true` or absent;
+/// - `?vector` is a constant vector;
+/// - `?column` refers to exactly one column;
+/// - some index on that column's table covers exactly that column and is an `IvfFlat` index
+///   with the same distance function.
+fn has_vector_index(
+    column: &str,
+    op: &str,
+    vector: &str,
+    filter: &str,
+) -> impl Fn(&mut EGraph, Id, &Subst) -> bool {
+    let column = var(column);
+    let vector = var(vector);
+    let filter = var(filter);
+    // The operator is fixed per rule, so parse it once here instead of on every invocation
+    // of the condition.
+    let vector_op = op.parse::<VectorDistance>().ok();
+    move |egraph, _, subst| {
+        let Some(vector_op) = &vector_op else {
+            return false;
+        };
+        let filter = &egraph[subst[filter]].data;
+        let vector = &egraph[subst[vector]].data;
+        let column = &egraph[subst[column]].data;
+        // Only support null filter or always true filter for now. Check if the filter is null or
+        // true.
+        // NOTE(review): `None` is accepted here; confirm that a non-constant predicate cannot
+        // reach this point with `constant == None`, otherwise it would pass this check too.
+        if !matches!(filter.constant, Some(DataValue::Bool(true)) | None) {
+            return false;
+        }
+        if !matches!(vector.constant, Some(DataValue::Vector(_))) {
+            return false;
+        }
+        // Check if the order by statement is in the form of vector column <-> constant vector
+        if column.columns.len() != 1 {
+            return false;
+        }
+        let column = column.columns.iter().next().unwrap();
+        let Expr::Column(col) = column else {
+            return false;
+        };
+        let catalog = &egraph.analysis.catalog;
+        let indexes = catalog.get_index_on_table(col.schema_id, col.table_id);
+        for index_id in indexes {
+            // Check if any index matches the exact op and the column
+            let index = catalog.get_index_by_id(col.schema_id, index_id).unwrap();
+            if index.column_idxs() != [col.column_id] {
+                continue;
+            }
+            if let IndexType::IvfFlat { distance, .. } = index.index_type() {
+                if distance == *vector_op {
+                    return true;
+                }
+            }
+        }
+        false
+    }
+}
+
 /// Returns true if the columns used in `expr` is disjoint from columns produced by `plan`.
 fn not_depend_on(expr: &str, plan: &str) -> impl Fn(&mut EGraph, Id, &Subst) -> bool {
     let expr = var(expr);
 
@@ -22,6 +22,7 @@ use std::sync::{Arc, Mutex};
 
 use super::index::InMemoryIndexes;
 use super::{InMemoryIndex, Storage, StorageError, StorageResult, TracedStorageError};
+use crate::binder::IndexType;
 use crate::catalog::{
     ColumnCatalog, ColumnId, IndexId, RootCatalog, RootCatalogRef, SchemaId, TableId, TableRefId,
 };
@@ -133,10 +134,17 @@ impl Storage for InMemoryStorage {
         index_name: &str,
         table_id: TableId,
         column_idxs: &[ColumnId],
+        index_type: &IndexType,
     ) -> StorageResult<IndexId> {
         let idx_id = self
             .catalog
-            .add_index(schema_id, index_name.to_string(), table_id, column_idxs)
+            .add_index(
+                schema_id,
+                index_name.to_string(),
+                table_id,
+                column_idxs,
+                index_type,
+            )
             .map_err(|_| StorageError::Duplicated("index", index_name.into()))?;
         self.indexes
             .lock()
 
@@ -24,6 +24,7 @@ pub use chunk::*;
 use enum_dispatch::enum_dispatch;
 
 use crate::array::{ArrayImpl, DataChunk};
+use crate::binder::IndexType;
 use crate::catalog::{
     ColumnCatalog, ColumnId, IndexId, RootCatalog, SchemaId, TableId, TableRefId,
 };
@@ -93,6 +94,7 @@ pub trait Storage: Sync + Send + 'static {
         index_name: &str,
         table_id: TableId,
         column_idxs: &[ColumnId],
+        index_type: &IndexType,
     ) -> impl Future<Output = StorageResult<IndexId>> + Send;
 
     /// Get the catalog of the storage engine.
 
@@ -34,6 +34,7 @@ use version_manager::*;
 
 use super::index::InMemoryIndexes;
 use super::{InMemoryIndex, Storage, StorageError, StorageResult, TracedStorageError};
+use crate::binder::IndexType;
 use crate::catalog::{
     ColumnCatalog, ColumnId, IndexId, RootCatalog, RootCatalogRef, SchemaId, TableId, TableRefId,
 };
@@ -200,10 +201,17 @@ impl Storage for SecondaryStorage {
         index_name: &str,
         table_id: TableId,
         column_idxs: &[ColumnId],
+        index_type: &IndexType,
     ) -> StorageResult<IndexId> {
         let idx_id = self
             .catalog
-            .add_index(schema_id, index_name.to_string(), table_id, column_idxs)
+            .add_index(
+                schema_id,
+                index_name.to_string(),
+                table_id,
+                column_idxs,
+                index_type,
+            )
             .map_err(|_| StorageError::Duplicated("index", index_name.into()))?;
         self.indexes
             .lock()
 
@@ -0,0 +1,15 @@
+-- match the index
+explain select * from t order by a <-> '[0, 0, 1]'::VECTOR(3);
+
+/*
+IndexScan { table: t, columns: [ a, b ], filter: true, key: a, vector: [0,0,1], cost: 0, rows: 1 }
+*/
+
+-- does not match the index (the index uses <->, the query orders by <=>)
+explain select * from t order by a <=> '[0, 0, 1]'::VECTOR(3);
+
+/*
+Order { by: [ VectorCosineDistance { lhs: a, rhs: [0,0,1] } ], cost: 18, rows: 3 }
+└── Scan { table: t, list: [ a, b ], filter: true, cost: 6, rows: 3 }
+*/
+
@@ -0,0 +1,18 @@
+- sql: |
+    explain select * from t order by a <-> '[0, 0, 1]'::VECTOR(3);
+  desc: match the index
+  before:
+    - CREATE TABLE t (a vector(3) not null, b text not null);
+      INSERT INTO t VALUES ('[0, 0, 1]', 'a'), ('[0, 0, 2]', 'b'), ('[0, 0, 3]', 'c');
+      CREATE INDEX t_ivfflat ON t USING ivfflat (a) WITH (distfn = '<->', nlists = 3, nprobe = 2);
+  tasks:
+    - print
+- sql: |
+    explain select * from t order by a <=> '[0, 0, 1]'::VECTOR(3);
+  desc: does not match the index (the index uses <->, the query orders by <=>)
+  before:
+    - CREATE TABLE t (a vector(3) not null, b text not null);
+      INSERT INTO t VALUES ('[0, 0, 1]', 'a'), ('[0, 0, 2]', 'b'), ('[0, 0, 3]', 'c');
+      CREATE INDEX t_ivfflat ON t USING ivfflat (a) WITH (distfn = '<->', nlists = 3, nprobe = 2);
+  tasks:
+    - print
@@ -12,7 +12,7 @@ query ITIT rowsort
 1 postgres 0 t
 
 statement ok
-create index i1 on t(v1)
+create index i1 on t using btree (v1)
 
 query ITITITT rowsort
 \di
 
@@ -0,0 +1,18 @@
+# vector_index
+statement ok
+create table t (a vector(3) not null, b text not null);
+
+statement ok
+insert into t values ('[-1, -2.0, -3]', 'a'), ('[1, 2.0, 3]', 'b');
+
+query RRR
+select * from t order by a <-> '[0, 0, 1]'::VECTOR(3);
+----
+[1,2,3] b
+[-1,-2,-3] a
+
+statement ok
+CREATE INDEX t_ivfflat ON t USING ivfflat (a) WITH (distfn = 'l2', nlists = 3, nprobe = 2);
+
+statement ok
+drop table t
Original file line number	Diff line number	Diff line change
`@@ -21,6 +21,7 @@ impl<S: Storage> CreateIndexExecutor<S> {`
`21`	`21`	`&self.index.index_name,`
`22`	`22`	`self.index.table_id,`
`23`	`23`	`&self.index.columns,`
	`24`	`+ &self.index.index_type,`
`24`	`25`	`)`
`25`	`26`	`.await?;`
`26`	`27`