4 changes: 3 additions & 1 deletion packages/backend/native/src/doc_loader.rs
@@ -17,9 +17,11 @@ impl Document {
     fn chunks(&self, env: Env) -> Result<JsObject> {
         let mut array = env.create_array_with_length(self.inner.chunks.len())?;
         for (i, chunk) in self.inner.chunks.iter().enumerate() {
+            let content = crate::utils::clean_content(&chunk.content);
+
             let mut obj = env.create_object()?;
             obj.set_named_property("index", i as i64)?;
-            obj.set_named_property("content", chunk.content.clone())?;
+            obj.set_named_property("content", content)?;
             array.set_element(i as u32, obj)?;
         }
         Ok(array)
2 changes: 2 additions & 0 deletions packages/backend/native/src/lib.rs
@@ -1,5 +1,7 @@
 #![deny(clippy::all)]
 
+mod utils;
+
 pub mod doc_loader;
 pub mod file_type;
 pub mod hashcash;
111 changes: 111 additions & 0 deletions packages/backend/native/src/utils.rs
@@ -0,0 +1,111 @@
/// Collapse every run of whitespace in `s` into a single ASCII space.
fn collapse_whitespace(s: &str) -> String {
    let mut result = String::new();
    let mut prev_was_whitespace = false;
    for c in s.chars() {
        if c.is_whitespace() {
            if !prev_was_whitespace {
                result.push(' ');
                prev_was_whitespace = true;
            }
        } else {
            result.push(c);
            prev_was_whitespace = false;
        }
    }
    result
}

/// If `s[i..]` starts with a label such as `"Figure 12."` or `"Table 3."`,
/// return the byte index just past the trailing `'.'`; otherwise `None`.
fn try_remove_label(s: &str, i: usize) -> Option<usize> {
    let mut next_idx = match s[i..].to_ascii_lowercase() {
        s if s.starts_with("figure") => i + 6,
        s if s.starts_with("table") => i + 5,
        _ => return None,
    };

    if next_idx >= s.len() {
        return None;
    }

    // the keyword must be followed by whitespace
    if let Some(ch) = s[next_idx..].chars().next() {
        if !ch.is_whitespace() {
            return None;
        }
    } else {
        return None;
    }

    // skip the whitespace run
    while next_idx < s.len() {
        let ch = s[next_idx..].chars().next()?;
        if ch.is_whitespace() {
            next_idx += ch.len_utf8();
        } else {
            break;
        }
    }

    // require at least one ASCII digit
    let start_digits = next_idx;
    while next_idx < s.len() {
        let ch = s[next_idx..].chars().next()?;
        if ch.is_ascii_digit() {
            next_idx += ch.len_utf8();
        } else {
            break;
        }
    }

    if next_idx == start_digits {
        return None;
    }

    // and a '.' immediately after the digits
    if let Some(ch) = s[next_idx..].chars().next() {
        if ch == '.' {
            next_idx += ch.len_utf8();
            return Some(next_idx);
        }
    }
    None
}

/// Remove every figure/table label from `s`, keeping all other characters.
fn remove_label(s: &str) -> String {
    let mut result = String::with_capacity(s.len());
    let mut i = 0;
    while i < s.len() {
        if let Some(next_idx) = try_remove_label(s, i) {
            i = next_idx;
            continue;
        }

        let ch = s[i..].chars().next().unwrap();
        result.push(ch);
        i += ch.len_utf8();
    }
    result
}

/// Normalize extracted chunk text: drop NUL bytes, collapse whitespace runs,
/// strip figure/table labels, and trim the ends.
pub fn clean_content(content: &str) -> String {
    let content = content.replace('\x00', "");
    remove_label(&collapse_whitespace(&content))
        .trim()
        .to_string()
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_clean_input() {
        let inputs = [
            "FIGURE 1. This is a\t test\n\nwith multiple lines",
            "table 2. Another test\x00 with null",
            "Some text \t\n without label",
        ];
        let cleaned = [
            "This is a test with multiple lines",
            "Another test with null",
            "Some text without label",
        ];

        assert_eq!(cleaned, inputs.map(clean_content));
    }
}
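The label matcher is deliberately strict: the keyword must be followed by whitespace, then at least one ASCII digit, then an immediate '.'. A hypothetical companion test, assuming it lives in the same tests module as above, sketches the edge cases (it is illustrative, not part of this PR's test suite):

#[test]
fn label_matcher_edge_cases() {
    // no whitespace between the keyword and the number: the label survives
    assert_eq!(clean_content("figure12. kept"), "figure12. kept");
    // digits not followed by an immediate '.': the label survives
    assert_eq!(clean_content("Table 3 summary"), "Table 3 summary");
    // labels are stripped anywhere in the string; because whitespace is
    // collapsed *before* label removal, an interior double space can remain
    assert_eq!(clean_content("see figure 2. next"), "see  next");
}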
@@ -0,0 +1,75 @@
DO $$
DECLARE error_message TEXT;
BEGIN
  -- check whether the pgvector extension is installed
  IF NOT EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'vector') THEN
    BEGIN
      -- CreateExtension
      CREATE EXTENSION IF NOT EXISTS "vector";
    EXCEPTION
      WHEN OTHERS THEN
        -- if the extension is missing and cannot be created, emit a warning
        error_message := 'pgvector extension not found.' || E'\n' ||
          '****************************************************************************' || E'\n' ||
          '*                                                                          *' || E'\n' ||
          '* NOTICE: From AFFiNE 0.20 onwards, the copilot module will depend        *' || E'\n' ||
          '* on pgvector.                                                             *' || E'\n' ||
          '*                                                                          *' || E'\n' ||
          '* 1. If you are using the official PostgreSQL Docker container,            *' || E'\n' ||
          '*    please switch to the pgvector/pgvector:pg${VERSION} container,        *' || E'\n' ||
          '*    where ${VERSION} is the major version of your PostgreSQL container.   *' || E'\n' ||
          '*                                                                          *' || E'\n' ||
          '* 2. If you are using a self-installed PostgreSQL, please follow the       *' || E'\n' ||
          '*    official pgvector installation guide to install it into your          *' || E'\n' ||
          '*    database: https://github.com/pgvector/pgvector?tab=readme-ov-         *' || E'\n' ||
          '*    file#installation-notes---linux-and-mac                               *' || E'\n' ||
          '*                                                                          *' || E'\n' ||
          '****************************************************************************';

        RAISE WARNING '%', error_message;
    END;
  END IF;

  -- check again; initialize the tables only if the extension is installed
  IF EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'vector') THEN
    -- CreateTable
    CREATE TABLE "ai_context_embeddings" (
      "id" VARCHAR NOT NULL,
      "context_id" VARCHAR NOT NULL,
      "file_id" VARCHAR NOT NULL,
      "chunk" INTEGER NOT NULL,
      "content" VARCHAR NOT NULL,
      "embedding" vector(512) NOT NULL,
      "created_at" TIMESTAMPTZ(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
      "updated_at" TIMESTAMPTZ(3) NOT NULL,

      CONSTRAINT "ai_context_embeddings_pkey" PRIMARY KEY ("id")
    );

    -- CreateTable
    CREATE TABLE "ai_workspace_embeddings" (
      "workspace_id" VARCHAR NOT NULL,
      "doc_id" VARCHAR NOT NULL,
      "chunk" INTEGER NOT NULL,
      "content" VARCHAR NOT NULL,
      "embedding" vector(512) NOT NULL,
      "created_at" TIMESTAMPTZ(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
      "updated_at" TIMESTAMPTZ(3) NOT NULL,

      CONSTRAINT "ai_workspace_embeddings_pkey" PRIMARY KEY ("workspace_id", "doc_id")
    );

    -- CreateIndex
    CREATE INDEX IF NOT EXISTS "ai_context_embeddings_idx" ON ai_context_embeddings USING hnsw (embedding vector_cosine_ops);

    -- CreateIndex
    CREATE UNIQUE INDEX "ai_context_embeddings_context_id_file_id_chunk_key" ON "ai_context_embeddings"("context_id", "file_id", "chunk");

    -- CreateIndex
    CREATE INDEX IF NOT EXISTS "ai_workspace_embeddings_idx" ON ai_workspace_embeddings USING hnsw (embedding vector_cosine_ops);

    -- AddForeignKey
    ALTER TABLE "ai_context_embeddings" ADD CONSTRAINT "ai_context_embeddings_context_id_fkey" FOREIGN KEY ("context_id") REFERENCES "ai_contexts"("id") ON DELETE CASCADE ON UPDATE CASCADE;

    -- AddForeignKey
    ALTER TABLE "ai_workspace_embeddings" ADD CONSTRAINT "ai_workspace_embeddings_workspace_id_doc_id_fkey" FOREIGN KEY ("workspace_id", "doc_id") REFERENCES "snapshots"("workspace_id", "guid") ON DELETE CASCADE ON UPDATE CASCADE;
  END IF;
END $$;
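Both hnsw indexes are built with vector_cosine_ops, so they accelerate pgvector's cosine-distance operator <=>. A sketch of the kind of nearest-neighbor lookup they are meant for ($1 and $2 are assumed query parameters, not part of this migration):

-- hypothetical similarity search over workspace embeddings;
-- $1 is a 512-dimension query vector, $2 a workspace id
SELECT doc_id, chunk, content,
       embedding <=> $1::vector AS distance
FROM ai_workspace_embeddings
WHERE workspace_id = $2
ORDER BY embedding <=> $1::vector
LIMIT 5;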
51 changes: 47 additions & 4 deletions packages/backend/server/schema.prisma
@@ -1,12 +1,13 @@
 generator client {
   provider        = "prisma-client-js"
   binaryTargets   = ["native", "debian-openssl-3.0.x", "linux-arm64-openssl-3.0.x"]
-  previewFeatures = ["metrics", "relationJoins", "nativeDistinct"]
+  previewFeatures = ["metrics", "relationJoins", "nativeDistinct", "postgresqlExtensions"]
 }
 
 datasource db {
-  provider = "postgresql"
-  url      = env("DATABASE_URL")
+  provider   = "postgresql"
+  url        = env("DATABASE_URL")
+  extensions = [pgvector(map: "vector")]
 }
 
 model User {
@@ -281,6 +282,8 @@ model Snapshot {
   // we need to clear all hanging updates and snapshots before enabling the foreign key on workspaceId
   // workspace Workspace @relation(fields: [workspaceId], references: [id], onDelete: Cascade)
 
+  embedding AiWorkspaceEmbedding?
+
   @@id([workspaceId, id])
   @@index([workspaceId, updatedAt])
   @@map("snapshots")
@@ -426,11 +429,51 @@ model AiContext {
   createdAt DateTime @default(now()) @map("created_at") @db.Timestamptz(3)
   updatedAt DateTime @updatedAt @map("updated_at") @db.Timestamptz(3)
 
-  session AiSession @relation(fields: [sessionId], references: [id], onDelete: Cascade)
+  embeddings AiContextEmbedding[]
+  session    AiSession            @relation(fields: [sessionId], references: [id], onDelete: Cascade)
 
   @@map("ai_contexts")
 }

+model AiContextEmbedding {
+  id        String                     @id @default(uuid()) @db.VarChar
+  contextId String                     @map("context_id") @db.VarChar
+  fileId    String                     @map("file_id") @db.VarChar
+  // a file can be divided into multiple chunks and embedded separately.
+  chunk     Int                        @db.Integer
+  content   String                     @db.VarChar
+  embedding Unsupported("vector(512)")
+
+  createdAt DateTime @default(now()) @map("created_at") @db.Timestamptz(3)
+  updatedAt DateTime @updatedAt @map("updated_at") @db.Timestamptz(3)
+
+  context AiContext @relation(fields: [contextId], references: [id], onDelete: Cascade)
+
+  @@unique([contextId, fileId, chunk])
+  @@index([embedding], map: "ai_context_embeddings_idx")
+  @@map("ai_context_embeddings")
+}

+model AiWorkspaceEmbedding {
+  workspaceId String                     @map("workspace_id") @db.VarChar
+  docId       String                     @map("doc_id") @db.VarChar
+  // a doc can be divided into multiple chunks and embedded separately.
+  chunk       Int                        @db.Integer
+  content     String                     @db.VarChar
+  embedding   Unsupported("vector(512)")
+
+  createdAt DateTime @default(now()) @map("created_at") @db.Timestamptz(3)
+  updatedAt DateTime @updatedAt @map("updated_at") @db.Timestamptz(3)
+
+  // workspace-level search is not available for non-cloud workspaces,
+  // so we can match this record with the snapshot one-to-one
+  snapshot Snapshot @relation(fields: [workspaceId, docId], references: [workspaceId, id], onDelete: Cascade)
+
+  @@id([workspaceId, docId])
+  @@index([embedding], map: "ai_workspace_embeddings_idx")
+  @@map("ai_workspace_embeddings")
+}

 model DataMigration {
   id   String @id @default(uuid()) @db.VarChar
   name String @unique @db.VarChar
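Prisma's generated client cannot read or write Unsupported("vector(512)") columns, so embedding rows presumably have to be written with raw SQL. Note also that updated_at has no database-side default: @updatedAt is enforced by the Prisma client, not the database, so a raw write must set it explicitly. A minimal sketch under those assumptions ($1..$5 are hypothetical parameters; gen_random_uuid() assumes PostgreSQL 13+):

-- hypothetical raw upsert for a context embedding; $5 is a 512-dimension vector
INSERT INTO ai_context_embeddings
  (id, context_id, file_id, chunk, content, embedding, updated_at)
VALUES (gen_random_uuid(), $1, $2, $3, $4, $5::vector, CURRENT_TIMESTAMP)
ON CONFLICT (context_id, file_id, chunk) DO UPDATE
SET content    = EXCLUDED.content,
    embedding  = EXCLUDED.embedding,
    updated_at = CURRENT_TIMESTAMP;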
@@ -43,3 +43,14 @@ Generated by [AVA](https://avajs.dev).
        id: 'docId1',
      },
    ]

> should list context docs

    [
      {
        blobId: 'fileId1',
        chunkSize: 0,
        name: 'sample.pdf',
        status: 'processing',
      },
    ]