Skip to content

Commit

Permalink
Iterating on upgrade code
Browse files Browse the repository at this point in the history
  • Loading branch information
tjgreen42 committed Nov 14, 2024
1 parent 4dcc2bb commit 96c29ea
Show file tree
Hide file tree
Showing 2 changed files with 281 additions and 15 deletions.
130 changes: 116 additions & 14 deletions pgvectorscale/src/access_method/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,23 +88,125 @@ fn amhandler(_fcinfo: pg_sys::FunctionCallInfo) -> PgBox<pg_sys::IndexAmRoutine>
amroutine.into_pg_boxed()
}

// Background on system catalog state needed to understand the SQL for idempotent install/upgrade
// ----------------------------------------------------------------------------------------------
//
// When installing from scratch, we execute:
//
// CREATE OPERATOR CLASS vector_cosine_ops
// DEFAULT FOR TYPE vector USING diskann AS
// OPERATOR 1 <=> (vector, vector) FOR ORDER BY float_ops,
// FUNCTION 1 distance_type_cosine();
//
// This creates the following system catalog state:
//
// (1) A row in pg_opclass for vector_cosine_ops and diskann:
//
// oid | opcmethod | opcname | opcnamespace | opcowner | opcfamily | opcintype | opcdefault | opckeytype
// -------+-----------+-------------------+--------------+----------+-----------+-----------+------------+------------
// 17722 | 17718 | vector_cosine_ops | 2200 | 10 | 17721 | 17389 | t | 0
//
// Note: opcmethod is the oid of the access method (diskann) already in pg_am.
// Also: note that opcdefault is t, which means that this is the default operator class for the type.
//
// (2) A row in pg_amop for the <=> operator:
// oid | amopfamily | amoplefttype | amoprighttype | amopstrategy | amoppurpose | amopopr | amopmethod | amopsortfamily
// -------+------------+--------------+---------------+--------------+-------------+---------+------------+----------------
// 17723 | 17721 | 17389 | 17389 | 1 | o | 17438 | 17718 | 1970
//
// (3) A row in pg_amproc for the distance_type_cosine function:
//
// oid | amprocfamily | amproclefttype | amprocrighttype | amprocnum | amproc
// -------+--------------+----------------+-----------------+-----------+----------------------
// 17724 | 17721 | 17389 | 17389 | 1 | distance_type_cosine
//
// Version 0.4.0 contained the same SQL as above, but without the FUNCTION 1 part:
//
// CREATE OPERATOR CLASS vector_cosine_ops
// DEFAULT FOR TYPE vector USING diskann AS
// OPERATOR 1 <=> (vector, vector) FOR ORDER BY float_ops;
//
// Thus, when upgrading from 0.4.0 to 0.5.0, we need to add the appropriate entry in `pg_amproc`.
//
// Similarly, here is the sample system catalog state created by:
//
// CREATE OPERATOR CLASS vector_l2_ops
// FOR TYPE vector USING diskann AS
// OPERATOR 1 <-> (vector, vector) FOR ORDER BY float_ops,
// FUNCTION 1 distance_type_l2();
//
// (1) A row in pg_opclass for vector_l2_ops and diskann:
//
// oid | opcmethod | opcname | opcnamespace | opcowner | opcfamily | opcintype | opcdefault | opckeytype
// -------+-----------+---------------+--------------+----------+-----------+-----------+------------+------------
// 17726 | 17718 | vector_l2_ops | 2200 | 10 | 17725 | 17389 | f | 0
//
// Note: opcmethod is the oid of the access method (diskann) already in pg_am.
// Also: note that opcdefault is f, which means that this is not the default operator class for the type.
//
// (2) A row in pg_amop for the <-> operator:
//
// oid | amopfamily | amoplefttype | amoprighttype | amopstrategy | amoppurpose | amopopr | amopmethod | amopsortfamily
// -------+------------+--------------+---------------+--------------+-------------+---------+------------+----------------
// 17727 | 17725 | 17389 | 17389 | 1 | o | 17436 | 17718 | 1970
//
// (3) A row in pg_amproc for the distance_type_l2 function:
//
// oid | amprocfamily | amproclefttype | amprocrighttype | amprocnum | amproc
// -------+--------------+----------------+-----------------+-----------+------------------
// 17728 | 17725 | 17389 | 17389 | 1 | distance_type_l2
//
// However, the situation is easier for upgrade. Version 0.4.0 did not contain support for the L2 distance, so we can
// just run the CREATE OPERATOR CLASS statement above to add the L2 distance support.

// This SQL is made idempotent so that we can use the same script for the installation and the upgrade.
extension_sql!(
r#"
DROP OPERATOR CLASS IF EXISTS vector_cosine_ops USING diskann;
CREATE OPERATOR CLASS vector_cosine_ops
DEFAULT FOR TYPE vector USING diskann AS
OPERATOR 1 <=> (vector, vector) FOR ORDER BY float_ops,
FUNCTION 1 distance_type_cosine();
DROP OPERATOR CLASS IF EXISTS vector_l2_ops USING diskann;
CREATE OPERATOR CLASS vector_l2_ops
FOR TYPE vector USING diskann AS
OPERATOR 1 <-> (vector, vector) FOR ORDER BY float_ops,
FUNCTION 1 distance_type_l2();
-- Idempotent install/upgrade of the diskann operator classes.
--
-- Three states are handled:
--   * no cosine opclass        -> fresh install: create both opclasses.
--   * cosine but no L2 opclass -> upgrade from a version (e.g. 0.4.0) whose cosine
--                                 opclass has no support function: add it, then
--                                 create the L2 opclass.
--   * both present             -> nothing to do.
DO $$
DECLARE
  cosine_opclass_count int;
  l2_opclass_count int;
BEGIN
  -- Has cosine operator class been installed previously?
  SELECT count(*)
  INTO cosine_opclass_count
  FROM pg_catalog.pg_opclass oc
  WHERE oc.opcname = 'vector_cosine_ops'
  AND oc.opcmethod = (SELECT oid FROM pg_catalog.pg_am am WHERE am.amname = 'diskann');

  -- Has L2 operator class been installed previously?
  SELECT count(*)
  INTO l2_opclass_count
  FROM pg_catalog.pg_opclass oc
  WHERE oc.opcname = 'vector_l2_ops'
  AND oc.opcmethod = (SELECT oid FROM pg_catalog.pg_am am WHERE am.amname = 'diskann');

  IF cosine_opclass_count = 0 THEN
    -- Fresh install from scratch
    CREATE OPERATOR CLASS vector_cosine_ops DEFAULT
    FOR TYPE vector USING diskann AS
      OPERATOR 1 <=> (vector, vector) FOR ORDER BY float_ops,
      FUNCTION 1 distance_type_cosine();

    CREATE OPERATOR CLASS vector_l2_ops
    FOR TYPE vector USING diskann AS
      OPERATOR 1 <-> (vector, vector) FOR ORDER BY float_ops,
      FUNCTION 1 distance_type_l2();
  ELSIF l2_opclass_count = 0 THEN
    -- Upgrade: the existing cosine opclass was created without FUNCTION 1, so
    -- register distance_type_cosine as support function 1 of its operator family
    -- (the family implicitly created by CREATE OPERATOR CLASS shares the opclass
    -- name). The operand types must be spelled out because the function itself
    -- takes no arguments. This yields the pg_amproc row a fresh install would have.
    ALTER OPERATOR FAMILY vector_cosine_ops USING diskann ADD
      FUNCTION 1 (vector, vector) distance_type_cosine();

    -- L2 support did not exist before, so the opclass can simply be created.
    CREATE OPERATOR CLASS vector_l2_ops
    FOR TYPE vector USING diskann AS
      OPERATOR 1 <-> (vector, vector) FOR ORDER BY float_ops,
      FUNCTION 1 distance_type_l2();
  END IF;
END;
$$;
"#,
name = "diskann_ops_operator",
requires = [amhandler, distance_type_cosine, distance_type_l2]
Expand Down
166 changes: 165 additions & 1 deletion pgvectorscale/src/access_method/upgrade_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ pub mod tests {
#[test]
#[ignore]
///This function is only a mock to bring up the test framework in test_delete_vacuum
fn test_upgrade() {
fn test_upgrade_from_0_0_2() {
if cfg!(feature = "pg17") {
// PG17 is only supported for one version
return;
Expand Down Expand Up @@ -210,4 +210,168 @@ pub mod tests {
))
.unwrap();
}

#[test]
/// Checks that a diskann index built under vectorscale 0.4.0 is still queryable after
/// `ALTER EXTENSION vectorscale UPDATE` to the current crate version, and that indexes using
/// `vector_cosine_ops` and `vector_l2_ops` can be created afterwards.
///
/// The test copies the repository into a temporary directory, checks out the `0.4.0` ref
/// there, installs that build with `cargo pgrx install`, creates data and an index, then
/// installs the current build and runs the extension update.
///
/// Unlike `test_upgrade_from_0_0_2`, this test is not marked `#[ignore]`.
fn test_upgrade_from_0_4_0() {
    if cfg!(feature = "pg17") {
        // PG17 is only supported for one version
        return;
    }
    pgrx_tests::run_test(
        "test_delete_mock_fn",
        None,
        crate::pg_test::postgresql_conf_options(),
    )
    .unwrap();

    let (mut client, _) = pgrx_tests::client().unwrap();

    client
        .execute("DROP EXTENSION IF EXISTS vectorscale CASCADE;", &[])
        .unwrap();

    // Resolve this source file against the working directory and walk four levels up.
    // NOTE(review): presumably this lands on the repository root -- confirm.
    let repo_root = std::env::current_dir()
        .unwrap()
        .join(file!())
        .ancestors()
        .nth(4)
        .unwrap()
        .to_path_buf();

    // Work on a throwaway copy so the forced checkout below cannot touch the real tree.
    let temp_dir = tempfile::tempdir().unwrap();
    let temp_path = temp_dir.path();

    copy_dir_all(repo_root.clone(), temp_dir.path()).unwrap();

    let pgrx = pgrx_pg_config::Pgrx::from_config().unwrap();
    let pg_version = pg_sys::get_pg_major_version_num();
    let pg_config = pgrx.get(&format!("pg{}", pg_version)).unwrap();

    let version = "0.4.0";
    let res = std::process::Command::new("git")
        .current_dir(temp_path)
        .arg("checkout")
        .arg("-f")
        .arg(version)
        .output()
        .unwrap();
    assert!(
        res.status.success(),
        "failed: {:?} {:?} {:?}",
        res,
        repo_root,
        temp_dir.path()
    );

    // Install the old version from the temporary checkout.
    let res = std::process::Command::new("cargo")
        .current_dir(temp_path.join("pgvectorscale"))
        .arg("pgrx")
        .arg("install")
        .arg("--test")
        .arg("--pg-config")
        .arg(pg_config.path().unwrap())
        .stdout(Stdio::inherit())
        .stderr(Stdio::piped())
        .output()
        .unwrap();
    assert!(res.status.success(), "failed: {:?}", res);

    client
        .execute(
            &format!(
                "CREATE EXTENSION vectorscale VERSION '{}' CASCADE;",
                version
            ),
            &[],
        )
        .unwrap();

    // "1, 2, ..., 253": pads the three leading components of the literals below to 256 dims.
    let suffix = (1..=253)
        .map(|i| i.to_string())
        .collect::<Vec<_>>()
        .join(", ");

    client
        .batch_execute(&format!(
            "CREATE TABLE test(embedding vector(256));
            select setseed(0.5);
            -- generate 300 vectors
            INSERT INTO test(embedding)
            SELECT
            *
            FROM (
            SELECT
            ('[ 0 , ' || array_to_string(array_agg(random()), ',', '0') || ']')::vector AS embedding
            FROM
            generate_series(1, 255 * 300) i
            GROUP BY
            i % 300) g;
            INSERT INTO test(embedding) VALUES ('[1,2,3,{suffix}]'), ('[4,5,6,{suffix}]'), ('[7,8,10,{suffix}]');
            CREATE INDEX idxtest
            ON test
            USING diskann(embedding);
            "
        ))
        .unwrap();

    // The same ordered count is issued before and after the upgrade; build it once.
    let count_query = format!(
        "WITH cte as (select * from test order by embedding <=> '[1,1,1,{suffix}]') SELECT count(*) from cte;"
    );

    client.execute("set enable_seqscan = 0;", &[]).unwrap();
    let cnt: i64 = client.query_one(&count_query, &[]).unwrap().get(0);
    assert_eq!(cnt, 303, "count before upgrade");

    //reinstall myself
    let res = std::process::Command::new("cargo")
        .arg("pgrx")
        .arg("install")
        .arg("--test")
        .arg("--pg-config")
        .arg(pg_config.path().unwrap())
        .stdout(Stdio::inherit())
        .stderr(Stdio::piped())
        .output()
        .unwrap();
    assert!(res.status.success(), "failed: {:?}", res);

    // NOTE(review): the extension was created above under the name `vectorscale`, so this
    // rename looks like a no-op here -- confirm and drop it if so.
    client
        .execute(
            "UPDATE pg_extension SET extname='vectorscale' WHERE extname = 'timescale_vector';",
            &[],
        )
        .unwrap();

    //need to recreate the client to avoid double load of GUC. Look into this later.
    let (mut client, _) = pgrx_tests::client().unwrap();
    client
        .execute(
            &format!(
                "ALTER EXTENSION vectorscale UPDATE TO '{}'",
                env!("CARGO_PKG_VERSION")
            ),
            &[],
        )
        .unwrap();

    client.execute("set enable_seqscan = 0;", &[]).unwrap();
    let cnt: i64 = client.query_one(&count_query, &[]).unwrap().get(0);
    assert_eq!(cnt, 303, "count after upgrade");

    // After the upgrade both operator classes must be usable for new indexes.
    client
        .batch_execute(
            "DROP INDEX idxtest;
            CREATE INDEX idxtest_cosine
            ON test
            USING diskann(embedding vector_cosine_ops);
            CREATE INDEX idxtest_l2
            ON test
            USING diskann(embedding vector_l2_ops);",
        )
        .unwrap();
}
}

0 comments on commit 96c29ea

Please sign in to comment.