Skip to content

Commit f12ea23

Browse files
skyzhwangrunji0408
andauthoredFeb 14, 2025··
feat(planner): match vector indexes (#874)
This patch adds support for create index statements on vector columns and produces vector_index_scan plans. The executors are not implemented yet. --------- Signed-off-by: Alex Chi <iskyzh@gmail.com> Signed-off-by: Alex Chi Z. <4198311+skyzh@users.noreply.github.com> Co-authored-by: Runji Wang <wangrunji0408@163.com>
1 parent 1d646b8 commit f12ea23

File tree

18 files changed

+299
-9
lines changed

18 files changed

+299
-9
lines changed
 

‎src/binder/create_index.rs

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,44 @@ use serde::{Deserialize, Serialize};
1010
use super::*;
1111
use crate::catalog::{ColumnId, SchemaId, TableId};
1212

13+
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Clone, Serialize, Deserialize)]
14+
pub enum VectorDistance {
15+
Cosine,
16+
L2,
17+
NegativeDotProduct,
18+
}
19+
20+
impl FromStr for VectorDistance {
21+
type Err = String;
22+
23+
fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
24+
match s {
25+
"cosine" | "<=>" => Ok(VectorDistance::Cosine),
26+
"l2" | "<->" => Ok(VectorDistance::L2),
27+
"dotproduct" | "<#>" => Ok(VectorDistance::NegativeDotProduct),
28+
_ => Err(format!("invalid vector distance: {}", s)),
29+
}
30+
}
31+
}
32+
33+
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Clone, Serialize, Deserialize)]
34+
pub enum IndexType {
35+
Hnsw,
36+
IvfFlat {
37+
distance: VectorDistance,
38+
nlists: usize,
39+
nprobe: usize,
40+
},
41+
Btree,
42+
}
43+
1344
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Clone, Serialize, Deserialize)]
1445
pub struct CreateIndex {
1546
pub schema_id: SchemaId,
1647
pub index_name: String,
1748
pub table_id: TableId,
1849
pub columns: Vec<ColumnId>,
50+
pub index_type: IndexType,
1951
}
2052

2153
impl fmt::Display for CreateIndex {
@@ -48,6 +80,80 @@ impl FromStr for Box<CreateIndex> {
4880
}
4981

5082
impl Binder {
83+
fn parse_index_type(&self, using: Option<Ident>, with: Vec<Expr>) -> Result<IndexType> {
84+
let Some(using) = using else {
85+
return Err(ErrorKind::InvalidIndex("using clause is required".to_string()).into());
86+
};
87+
match using.to_string().to_lowercase().as_str() {
88+
"btree" => Ok(IndexType::Btree),
89+
"hnsw" => Ok(IndexType::Hnsw),
90+
"ivfflat" => {
91+
let mut distfn = None;
92+
let mut nlists = None;
93+
let mut nprobe = None;
94+
for expr in with {
95+
let Expr::BinaryOp { left, op, right } = expr else {
96+
return Err(
97+
ErrorKind::InvalidIndex("invalid with clause".to_string()).into()
98+
);
99+
};
100+
if op != BinaryOperator::Eq {
101+
return Err(
102+
ErrorKind::InvalidIndex("invalid with clause".to_string()).into()
103+
);
104+
}
105+
let Expr::Identifier(Ident { value: key, .. }) = *left else {
106+
return Err(
107+
ErrorKind::InvalidIndex("invalid with clause".to_string()).into()
108+
);
109+
};
110+
let key = key.to_lowercase();
111+
let Expr::Value(v) = *right else {
112+
return Err(
113+
ErrorKind::InvalidIndex("invalid with clause".to_string()).into()
114+
);
115+
};
116+
let v: DataValue = v.into();
117+
match key.as_str() {
118+
"distfn" => {
119+
let v = v.as_str();
120+
distfn = Some(v.to_lowercase());
121+
}
122+
"nlists" => {
123+
let Some(v) = v.as_usize().unwrap() else {
124+
return Err(ErrorKind::InvalidIndex(
125+
"invalid with clause".to_string(),
126+
)
127+
.into());
128+
};
129+
nlists = Some(v);
130+
}
131+
"nprobe" => {
132+
let Some(v) = v.as_usize().unwrap() else {
133+
return Err(ErrorKind::InvalidIndex(
134+
"invalid with clause".to_string(),
135+
)
136+
.into());
137+
};
138+
nprobe = Some(v);
139+
}
140+
_ => {
141+
return Err(
142+
ErrorKind::InvalidIndex("invalid with clause".to_string()).into()
143+
);
144+
}
145+
}
146+
}
147+
Ok(IndexType::IvfFlat {
148+
distance: VectorDistance::from_str(distfn.unwrap().as_str()).unwrap(),
149+
nlists: nlists.unwrap(),
150+
nprobe: nprobe.unwrap(),
151+
})
152+
}
153+
_ => Err(ErrorKind::InvalidIndex("invalid index type".to_string()).into()),
154+
}
155+
}
156+
51157
pub(super) fn bind_create_index(&mut self, stat: crate::parser::CreateIndex) -> Result {
52158
let Some(ref name) = stat.name else {
53159
return Err(
@@ -57,6 +163,8 @@ impl Binder {
57163
let crate::parser::CreateIndex {
58164
table_name,
59165
columns,
166+
using,
167+
with,
60168
..
61169
} = stat;
62170
let index_name = lower_case_name(name);
@@ -94,6 +202,7 @@ impl Binder {
94202
index_name: index_name.into(),
95203
table_id: table.id(),
96204
columns: column_ids,
205+
index_type: self.parse_index_type(using, with)?,
97206
})));
98207
Ok(create)
99208
}

‎src/binder/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ mod select;
2929
mod table;
3030

3131
pub use self::create_function::CreateFunction;
32-
pub use self::create_index::CreateIndex;
32+
pub use self::create_index::{CreateIndex, IndexType, VectorDistance};
3333
pub use self::create_table::CreateTable;
3434
pub use self::error::BindError;
3535
use self::error::ErrorKind;

‎src/catalog/index.rs

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,31 @@
11
// Copyright 2025 RisingLight Project Authors. Licensed under Apache-2.0.
22

33
use super::*;
4+
use crate::binder::IndexType;
45

56
/// The catalog of an index.
67
pub struct IndexCatalog {
78
id: IndexId,
89
name: String,
910
table_id: TableId,
1011
column_idxs: Vec<ColumnId>,
12+
index_type: IndexType,
1113
}
1214

1315
impl IndexCatalog {
14-
pub fn new(id: IndexId, name: String, table_id: TableId, column_idxs: Vec<ColumnId>) -> Self {
16+
pub fn new(
17+
id: IndexId,
18+
name: String,
19+
table_id: TableId,
20+
column_idxs: Vec<ColumnId>,
21+
index_type: IndexType,
22+
) -> Self {
1523
Self {
1624
id,
1725
name,
1826
table_id,
1927
column_idxs,
28+
index_type,
2029
}
2130
}
2231

@@ -35,4 +44,8 @@ impl IndexCatalog {
3544
pub fn name(&self) -> &str {
3645
&self.name
3746
}
47+
48+
pub fn index_type(&self) -> IndexType {
49+
self.index_type.clone()
50+
}
3851
}

‎src/catalog/root.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ use std::sync::{Arc, Mutex};
55

66
use super::function::FunctionCatalog;
77
use super::*;
8+
use crate::binder::IndexType;
89
use crate::parser;
910
use crate::planner::RecExpr;
1011

@@ -104,10 +105,11 @@ impl RootCatalog {
104105
index_name: String,
105106
table_id: TableId,
106107
column_idxs: &[ColumnId],
108+
index_type: &IndexType,
107109
) -> Result<IndexId, CatalogError> {
108110
let mut inner = self.inner.lock().unwrap();
109111
let schema = inner.schemas.get_mut(&schema_id).unwrap();
110-
schema.add_index(index_name, table_id, column_idxs.to_vec())
112+
schema.add_index(index_name, table_id, column_idxs.to_vec(), index_type)
111113
}
112114

113115
pub fn get_index_on_table(&self, schema_id: SchemaId, table_id: TableId) -> Vec<IndexId> {

‎src/catalog/schema.rs

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ use std::sync::Arc;
55

66
use super::function::FunctionCatalog;
77
use super::*;
8+
use crate::binder::IndexType;
89
use crate::planner::RecExpr;
910

1011
/// The catalog of a schema.
@@ -62,13 +63,20 @@ impl SchemaCatalog {
6263
name: String,
6364
table_id: TableId,
6465
columns: Vec<ColumnId>,
66+
index_type: &IndexType,
6567
) -> Result<IndexId, CatalogError> {
6668
if self.indexes_idxs.contains_key(&name) {
6769
return Err(CatalogError::Duplicated("index", name));
6870
}
6971
let index_id = self.next_id;
7072
self.next_id += 1;
71-
let index_catalog = Arc::new(IndexCatalog::new(index_id, name.clone(), table_id, columns));
73+
let index_catalog = Arc::new(IndexCatalog::new(
74+
index_id,
75+
name.clone(),
76+
table_id,
77+
columns,
78+
index_type.clone(),
79+
));
7280
self.indexes_idxs.insert(name, index_id);
7381
self.indexes.insert(index_id, index_catalog);
7482
Ok(index_id)

‎src/executor/create_index.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ impl<S: Storage> CreateIndexExecutor<S> {
2121
&self.index.index_name,
2222
self.index.table_id,
2323
&self.index.columns,
24+
&self.index.index_type,
2425
)
2526
.await?;
2627

‎src/planner/cost.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ impl egg::CostFunction<Expr> for CostFn<'_> {
3131

3232
let c = match enode {
3333
// plan nodes
34-
Scan(_) | Values(_) => build(),
34+
Scan(_) | Values(_) | IndexScan(_) => build(),
3535
Order([_, c]) => nlogn(rows(c)) + build() + costs(c),
3636
Filter([exprs, c]) => costs(exprs) * rows(c) + build() + costs(c),
3737
Proj([exprs, c]) | Window([exprs, c]) => costs(exprs) * rows(c) + costs(c),

‎src/planner/explain.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,16 @@ impl<'a> Explain<'a> {
248248
("filter", self.expr(filter).pretty()),
249249
]),
250250
),
251+
IndexScan([table, columns, filter, key, vector]) => Pretty::childless_record(
252+
"IndexScan",
253+
with_meta(vec![
254+
("table", self.expr(table).pretty()),
255+
("columns", self.expr(columns).pretty()),
256+
("filter", self.expr(filter).pretty()),
257+
("key", self.expr(key).pretty()),
258+
("vector", self.expr(vector).pretty()),
259+
]),
260+
),
251261
Values(values) => Pretty::simple_record(
252262
"Values",
253263
with_meta(vec![("rows", Pretty::display(&values.len()))]),

‎src/planner/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ define_language! {
9898

9999
// plans
100100
"scan" = Scan([Id; 3]), // (scan table [column..] filter)
101+
"index_scan" = IndexScan([Id; 5]), // (index_scan table [column..] filter key value)
101102
"values" = Values(Box<[Id]>), // (values [expr..]..)
102103
"proj" = Proj([Id; 2]), // (proj [expr..] child)
103104
"filter" = Filter([Id; 2]), // (filter expr child)

‎src/planner/optimizer.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,13 +121,14 @@ static STAGE1_RULES: LazyLock<Vec<Rewrite>> = LazyLock::new(|| {
121121
});
122122

123123
/// Stage2 rules in the optimizer.
124-
/// - pushdown predicate and projection
124+
/// - pushdown predicate, projection, and index scan
125125
static STAGE2_RULES: LazyLock<Vec<Rewrite>> = LazyLock::new(|| {
126126
let mut rules = vec![];
127127
rules.append(&mut rules::expr::rules());
128128
rules.append(&mut rules::plan::always_better_rules());
129129
rules.append(&mut rules::plan::predicate_pushdown_rules());
130130
rules.append(&mut rules::plan::projection_pushdown_rules());
131+
rules.append(&mut rules::plan::index_scan_rules());
131132
rules
132133
});
133134

‎src/planner/rules/plan.rs

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,9 @@ use itertools::Itertools;
66

77
use super::schema::schema_is_eq;
88
use super::*;
9+
use crate::binder::{IndexType, VectorDistance};
910
use crate::planner::ExprExt;
11+
use crate::types::DataValue;
1012

1113
/// Returns the rules that always improve the plan.
1214
pub fn always_better_rules() -> Vec<Rewrite> {
@@ -398,6 +400,80 @@ pub fn projection_pushdown_rules() -> Vec<Rewrite> { vec![
398400
),
399401
]}
400402

403+
/// Pushdown projections and prune unused columns.
404+
#[rustfmt::skip]
405+
pub fn index_scan_rules() -> Vec<Rewrite> { vec![
406+
rw!("vector-index-scan-1";
407+
"(order (list (<-> ?column ?vector)) (scan ?table ?columns ?filter))" =>
408+
"(index_scan ?table ?columns ?filter ?column ?vector)"
409+
if has_vector_index("?column", "<->", "?vector", "?filter")
410+
),
411+
rw!("vector-index-scan-2";
412+
"(order (list (<#> ?column ?vector)) (scan ?table ?columns ?filter))" =>
413+
"(index_scan ?table ?columns ?filter ?column ?vector)"
414+
if has_vector_index("?column", "<#>", "?vector", "?filter")
415+
),
416+
rw!("vector-index-scan-3";
417+
"(order (list (<=> ?column ?vector)) (scan ?table ?columns ?filter))" =>
418+
"(index_scan ?table ?columns ?filter ?column ?vector)"
419+
if has_vector_index("?column", "<=>", "?vector", "?filter")
420+
),
421+
]}
422+
423+
/// Check if there is a vector index matching the statement. i.e.,
424+
/// `SELECT * FROM t ORDER BY v <-> constant_vector` will match the index
425+
/// on the table t with the vector column v and using the `<->` distance function.
426+
fn has_vector_index(
427+
column: &str,
428+
op: &str,
429+
vector: &str,
430+
filter: &str,
431+
) -> impl Fn(&mut EGraph, Id, &Subst) -> bool {
432+
let column = var(column);
433+
let vector = var(vector);
434+
let filter = var(filter);
435+
let op = op.to_string();
436+
move |egraph, _, subst| {
437+
let filter = &egraph[subst[filter]].data;
438+
let vector = &egraph[subst[vector]].data;
439+
let column = &egraph[subst[column]].data;
440+
let Ok(vector_op) = op.parse::<VectorDistance>() else {
441+
return false;
442+
};
443+
// Only support null filter or always true filter for now. Check if the filter is null or
444+
// true.
445+
if !matches!(filter.constant, Some(DataValue::Bool(true)) | None) {
446+
return false;
447+
}
448+
if !matches!(vector.constant, Some(DataValue::Vector(_))) {
449+
return false;
450+
}
451+
// Check if the order by statement is in the form of vector column <-> constant vector
452+
if column.columns.len() != 1 {
453+
return false;
454+
}
455+
let column = column.columns.iter().next().unwrap();
456+
let Expr::Column(col) = column else {
457+
return false;
458+
};
459+
let catalog = &egraph.analysis.catalog;
460+
let indexes = catalog.get_index_on_table(col.schema_id, col.table_id);
461+
for index_id in indexes {
462+
// Check if any index matches the exact op and the column
463+
let index = catalog.get_index_by_id(col.schema_id, index_id).unwrap();
464+
if index.column_idxs() != [col.column_id] {
465+
continue;
466+
}
467+
if let IndexType::IvfFlat { distance, .. } = index.index_type() {
468+
if distance == vector_op {
469+
return true;
470+
}
471+
}
472+
}
473+
false
474+
}
475+
}
476+
401477
/// Returns true if the columns used in `expr` is disjoint from columns produced by `plan`.
402478
fn not_depend_on(expr: &str, plan: &str) -> impl Fn(&mut EGraph, Id, &Subst) -> bool {
403479
let expr = var(expr);

‎src/storage/memory/mod.rs

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ use std::sync::{Arc, Mutex};
2222

2323
use super::index::InMemoryIndexes;
2424
use super::{InMemoryIndex, Storage, StorageError, StorageResult, TracedStorageError};
25+
use crate::binder::IndexType;
2526
use crate::catalog::{
2627
ColumnCatalog, ColumnId, IndexId, RootCatalog, RootCatalogRef, SchemaId, TableId, TableRefId,
2728
};
@@ -133,10 +134,17 @@ impl Storage for InMemoryStorage {
133134
index_name: &str,
134135
table_id: TableId,
135136
column_idxs: &[ColumnId],
137+
index_type: &IndexType,
136138
) -> StorageResult<IndexId> {
137139
let idx_id = self
138140
.catalog
139-
.add_index(schema_id, index_name.to_string(), table_id, column_idxs)
141+
.add_index(
142+
schema_id,
143+
index_name.to_string(),
144+
table_id,
145+
column_idxs,
146+
index_type,
147+
)
140148
.map_err(|_| StorageError::Duplicated("index", index_name.into()))?;
141149
self.indexes
142150
.lock()

‎src/storage/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ pub use chunk::*;
2424
use enum_dispatch::enum_dispatch;
2525

2626
use crate::array::{ArrayImpl, DataChunk};
27+
use crate::binder::IndexType;
2728
use crate::catalog::{
2829
ColumnCatalog, ColumnId, IndexId, RootCatalog, SchemaId, TableId, TableRefId,
2930
};
@@ -93,6 +94,7 @@ pub trait Storage: Sync + Send + 'static {
9394
index_name: &str,
9495
table_id: TableId,
9596
column_idxs: &[ColumnId],
97+
index_type: &IndexType,
9698
) -> impl Future<Output = StorageResult<IndexId>> + Send;
9799

98100
/// Get the catalog of the storage engine.

‎src/storage/secondary/mod.rs

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ use version_manager::*;
3434

3535
use super::index::InMemoryIndexes;
3636
use super::{InMemoryIndex, Storage, StorageError, StorageResult, TracedStorageError};
37+
use crate::binder::IndexType;
3738
use crate::catalog::{
3839
ColumnCatalog, ColumnId, IndexId, RootCatalog, RootCatalogRef, SchemaId, TableId, TableRefId,
3940
};
@@ -200,10 +201,17 @@ impl Storage for SecondaryStorage {
200201
index_name: &str,
201202
table_id: TableId,
202203
column_idxs: &[ColumnId],
204+
index_type: &IndexType,
203205
) -> StorageResult<IndexId> {
204206
let idx_id = self
205207
.catalog
206-
.add_index(schema_id, index_name.to_string(), table_id, column_idxs)
208+
.add_index(
209+
schema_id,
210+
index_name.to_string(),
211+
table_id,
212+
column_idxs,
213+
index_type,
214+
)
207215
.map_err(|_| StorageError::Duplicated("index", index_name.into()))?;
208216
self.indexes
209217
.lock()

‎tests/planner_test/vector.planner.sql

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
-- match the index
2+
explain select * from t order by a <-> '[0, 0, 1]'::VECTOR(3);
3+
4+
/*
5+
IndexScan { table: t, columns: [ a, b ], filter: true, key: a, vector: [0,0,1], cost: 0, rows: 1 }
6+
*/
7+
8+
-- match the index
9+
explain select * from t order by a <=> '[0, 0, 1]'::VECTOR(3);
10+
11+
/*
12+
Order { by: [ VectorCosineDistance { lhs: a, rhs: [0,0,1] } ], cost: 18, rows: 3 }
13+
└── Scan { table: t, list: [ a, b ], filter: true, cost: 6, rows: 3 }
14+
*/
15+

‎tests/planner_test/vector.yml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
- sql: |
2+
explain select * from t order by a <-> '[0, 0, 1]'::VECTOR(3);
3+
desc: match the index
4+
before:
5+
- CREATE TABLE t (a vector(3) not null, b text not null);
6+
INSERT INTO t VALUES ('[0, 0, 1]', 'a'), ('[0, 0, 2]', 'b'), ('[0, 0, 3]', 'c');
7+
CREATE INDEX t_ivfflat ON t USING ivfflat (a) WITH (distfn = '<->', nlists = 3, nprobe = 2);
8+
tasks:
9+
- print
10+
- sql: |
11+
explain select * from t order by a <=> '[0, 0, 1]'::VECTOR(3);
12+
desc: match the index
13+
before:
14+
- CREATE TABLE t (a vector(3) not null, b text not null);
15+
INSERT INTO t VALUES ('[0, 0, 1]', 'a'), ('[0, 0, 2]', 'b'), ('[0, 0, 3]', 'c');
16+
CREATE INDEX t_ivfflat ON t USING ivfflat (a) WITH (distfn = '<->', nlists = 3, nprobe = 2);
17+
tasks:
18+
- print

‎tests/sql/catalog.slt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ query ITIT rowsort
1212
1 postgres 0 t
1313

1414
statement ok
15-
create index i1 on t(v1)
15+
create index i1 on t using btree (v1)
1616

1717
query ITITITT rowsort
1818
\di

‎tests/sql/vector_index.slt

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# vector_index
2+
statement ok
3+
create table t (a vector(3) not null, b text not null);
4+
5+
statement ok
6+
insert into t values ('[-1, -2.0, -3]', 'a'), ('[1, 2.0, 3]', 'b');
7+
8+
query RRR
9+
select * from t order by a <-> '[0, 0, 1]'::VECTOR(3);
10+
----
11+
[1,2,3] b
12+
[-1,-2,-3] a
13+
14+
statement ok
15+
CREATE INDEX t_ivfflat ON t USING ivfflat (a) WITH (distfn = 'l2', nlists = 3, nprobe = 2);
16+
17+
statement ok
18+
drop table t

0 commit comments

Comments
 (0)
Please sign in to comment.