forked from lvwzhen/law-cn-ai
-
Notifications
You must be signed in to change notification settings - Fork 0
/
20230406025118_init.sql
78 lines (68 loc) · 2.29 KB
/
20230406025118_init.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
-- Enable pgvector extension
create extension if not exists vector with schema public;
-- Create tables
create table "public"."nods_page" (
id bigserial primary key,
parent_page_id bigint references public.nods_page,
path text not null unique,
checksum text,
meta jsonb,
type text,
source text
);
alter table "public"."nods_page" enable row level security;
create table "public"."nods_page_section" (
id bigserial primary key,
page_id bigint not null references public.nods_page on delete cascade,
content text,
token_count int,
embedding vector(1536),
slug text,
heading text
);
alter table "public"."nods_page_section" enable row level security;
-- Create embedding similarity search functions
create or replace function match_page_sections(embedding vector(1536), match_threshold float, match_count int, min_content_length int)
returns table (id bigint, page_id bigint, slug text, heading text, content text, similarity float)
language plpgsql
as $$
#variable_conflict use_variable
begin
return query
select
nods_page_section.id,
nods_page_section.page_id,
nods_page_section.slug,
nods_page_section.heading,
nods_page_section.content,
(nods_page_section.embedding <#> embedding) * -1 as similarity
from nods_page_section
-- We only care about sections that have a useful amount of content
where length(nods_page_section.content) >= min_content_length
-- The dot product is negative because of a Postgres limitation, so we negate it
and (nods_page_section.embedding <#> embedding) * -1 > match_threshold
-- OpenAI embeddings are normalized to length 1, so
-- cosine similarity and dot product will produce the same results.
-- Using dot product which can be computed slightly faster.
--
-- For the different syntaxes, see https://github.com/pgvector/pgvector
order by nods_page_section.embedding <#> embedding
limit match_count;
end;
$$;
create or replace function get_page_parents(page_id bigint)
returns table (id bigint, parent_page_id bigint, path text, meta jsonb)
language sql
as $$
with recursive chain as (
select *
from nods_page
where id = page_id
union all
select child.*
from nods_page as child
join chain on chain.parent_page_id = child.id
)
select id, parent_page_id, path, meta
from chain;
$$;