4 changes: 3 additions & 1 deletion packages/backend/native/src/doc_loader.rs
@@ -17,9 +17,11 @@ impl Document {
     fn chunks(&self, env: Env) -> Result<JsObject> {
         let mut array = env.create_array_with_length(self.inner.chunks.len())?;
         for (i, chunk) in self.inner.chunks.iter().enumerate() {
+            let content = crate::utils::clean_content(&chunk.content);
+
             let mut obj = env.create_object()?;
             obj.set_named_property("index", i as i64)?;
-            obj.set_named_property("content", chunk.content.clone())?;
+            obj.set_named_property("content", content)?;
             array.set_element(i as u32, obj)?;
         }
         Ok(array)
2 changes: 2 additions & 0 deletions packages/backend/native/src/lib.rs
@@ -1,5 +1,7 @@
 #![deny(clippy::all)]
 
+mod utils;
+
 pub mod doc_loader;
 pub mod file_type;
 pub mod hashcash;
111 changes: 111 additions & 0 deletions packages/backend/native/src/utils.rs
@@ -0,0 +1,111 @@
/// Collapse every run of whitespace in `s` into a single ASCII space.
fn collapse_whitespace(s: &str) -> String {
    let mut result = String::new();
    let mut prev_was_whitespace = false;
    for c in s.chars() {
        if c.is_whitespace() {
            if !prev_was_whitespace {
                result.push(' ');
                prev_was_whitespace = true;
            }
        } else {
            result.push(c);
            prev_was_whitespace = false;
        }
    }
    result
}

/// If `s[i..]` starts with a label such as `"Figure 12."` or `"Table 3."`,
/// return the byte index just past the trailing `'.'`; otherwise `None`.
fn try_remove_label(s: &str, i: usize) -> Option<usize> {
    let mut next_idx = match s[i..].to_ascii_lowercase() {
        s if s.starts_with("figure") => i + 6,
        s if s.starts_with("table") => i + 5,
        _ => return None,
    };

    if next_idx >= s.len() {
        return None;
    }

    // the keyword must be followed by whitespace
    if let Some(ch) = s[next_idx..].chars().next() {
        if !ch.is_whitespace() {
            return None;
        }
    } else {
        return None;
    }

    // skip the whitespace run
    while next_idx < s.len() {
        let ch = s[next_idx..].chars().next()?;
        if ch.is_whitespace() {
            next_idx += ch.len_utf8();
        } else {
            break;
        }
    }

    // require at least one ASCII digit
    let start_digits = next_idx;
    while next_idx < s.len() {
        let ch = s[next_idx..].chars().next()?;
        if ch.is_ascii_digit() {
            next_idx += ch.len_utf8();
        } else {
            break;
        }
    }

    if next_idx == start_digits {
        return None;
    }

    // and a '.' immediately after the digits
    if let Some(ch) = s[next_idx..].chars().next() {
        if ch == '.' {
            next_idx += ch.len_utf8();
            return Some(next_idx);
        }
    }
    None
}

/// Remove every figure/table label from `s`, keeping all other characters.
fn remove_label(s: &str) -> String {
    let mut result = String::with_capacity(s.len());
    let mut i = 0;
    while i < s.len() {
        if let Some(next_idx) = try_remove_label(s, i) {
            i = next_idx;
            continue;
        }

        let ch = s[i..].chars().next().unwrap();
        result.push(ch);
        i += ch.len_utf8();
    }
    result
}

/// Normalize extracted chunk text: drop NUL bytes, collapse whitespace runs,
/// strip figure/table labels, and trim the ends.
pub fn clean_content(content: &str) -> String {
    let content = content.replace('\x00', "");
    remove_label(&collapse_whitespace(&content))
        .trim()
        .to_string()
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_clean_input() {
        let inputs = [
            "FIGURE 1. This is a\t test\n\nwith multiple lines",
            "table 2. Another test\x00 with null",
            "Some text \t\n without label",
        ];
        let cleaned = [
            "This is a test with multiple lines",
            "Another test with null",
            "Some text without label",
        ];

        assert_eq!(cleaned, inputs.map(clean_content));
    }
}
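The label matcher is deliberately strict: the keyword must be followed by whitespace, then at least one ASCII digit, then an immediate '.'. A hypothetical companion test, assuming it lives in the same tests module as above, sketches the edge cases (it is illustrative, not part of this PR's test suite):

#[test]
fn label_matcher_edge_cases() {
    // no whitespace between the keyword and the number: the label survives
    assert_eq!(clean_content("figure12. kept"), "figure12. kept");
    // digits not followed by an immediate '.': the label survives
    assert_eq!(clean_content("Table 3 summary"), "Table 3 summary");
    // labels are stripped anywhere in the string; because whitespace is
    // collapsed *before* label removal, an interior double space can remain
    assert_eq!(clean_content("see figure 2. next"), "see  next");
}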
@@ -0,0 +1,75 @@
DO $$
DECLARE error_message TEXT;
BEGIN
  -- check whether the pgvector extension is installed
  IF NOT EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'vector') THEN
    BEGIN
      -- CreateExtension
      CREATE EXTENSION IF NOT EXISTS "vector";
    EXCEPTION
      WHEN OTHERS THEN
        -- if the extension is missing and cannot be created, emit a warning
        error_message := 'pgvector extension not found.' || E'\n' ||
          '****************************************************************************' || E'\n' ||
          '*                                                                          *' || E'\n' ||
          '* NOTICE: From AFFiNE 0.20 onwards, the copilot module will depend        *' || E'\n' ||
          '* on pgvector.                                                             *' || E'\n' ||
          '*                                                                          *' || E'\n' ||
          '* 1. If you are using the official PostgreSQL Docker container,            *' || E'\n' ||
          '*    please switch to the pgvector/pgvector:pg${VERSION} container,        *' || E'\n' ||
          '*    where ${VERSION} is the major version of your PostgreSQL container.   *' || E'\n' ||
          '*                                                                          *' || E'\n' ||
          '* 2. If you are using a self-installed PostgreSQL, please follow the       *' || E'\n' ||
          '*    official pgvector installation guide to install it into your          *' || E'\n' ||
          '*    database: https://github.com/pgvector/pgvector?tab=readme-ov-         *' || E'\n' ||
          '*    file#installation-notes---linux-and-mac                               *' || E'\n' ||
          '*                                                                          *' || E'\n' ||
          '****************************************************************************';

        RAISE WARNING '%', error_message;
    END;
  END IF;

  -- check again; initialize the tables only if the extension is installed
  IF EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'vector') THEN
    -- CreateTable
    CREATE TABLE "ai_context_embeddings" (
      "id" VARCHAR NOT NULL,
      "context_id" VARCHAR NOT NULL,
      "file_id" VARCHAR NOT NULL,
      "chunk" INTEGER NOT NULL,
      "content" VARCHAR NOT NULL,
      "embedding" vector(512) NOT NULL,
      "created_at" TIMESTAMPTZ(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
      "updated_at" TIMESTAMPTZ(3) NOT NULL,

      CONSTRAINT "ai_context_embeddings_pkey" PRIMARY KEY ("id")
    );

    -- CreateTable
    CREATE TABLE "ai_workspace_embeddings" (
      "workspace_id" VARCHAR NOT NULL,
      "doc_id" VARCHAR NOT NULL,
      "chunk" INTEGER NOT NULL,
      "content" VARCHAR NOT NULL,
      "embedding" vector(512) NOT NULL,
      "created_at" TIMESTAMPTZ(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
      "updated_at" TIMESTAMPTZ(3) NOT NULL,

      CONSTRAINT "ai_workspace_embeddings_pkey" PRIMARY KEY ("workspace_id", "doc_id")
    );

    -- CreateIndex
    CREATE INDEX IF NOT EXISTS "ai_context_embeddings_idx" ON ai_context_embeddings USING hnsw (embedding vector_cosine_ops);

    -- CreateIndex
    CREATE UNIQUE INDEX "ai_context_embeddings_context_id_file_id_chunk_key" ON "ai_context_embeddings"("context_id", "file_id", "chunk");

    -- CreateIndex
    CREATE INDEX IF NOT EXISTS "ai_workspace_embeddings_idx" ON ai_workspace_embeddings USING hnsw (embedding vector_cosine_ops);

    -- AddForeignKey
    ALTER TABLE "ai_context_embeddings" ADD CONSTRAINT "ai_context_embeddings_context_id_fkey" FOREIGN KEY ("context_id") REFERENCES "ai_contexts"("id") ON DELETE CASCADE ON UPDATE CASCADE;

    -- AddForeignKey
    ALTER TABLE "ai_workspace_embeddings" ADD CONSTRAINT "ai_workspace_embeddings_workspace_id_doc_id_fkey" FOREIGN KEY ("workspace_id", "doc_id") REFERENCES "snapshots"("workspace_id", "guid") ON DELETE CASCADE ON UPDATE CASCADE;
  END IF;
END $$;
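Both hnsw indexes are built with vector_cosine_ops, so they accelerate pgvector's cosine-distance operator <=>. A sketch of the kind of nearest-neighbor lookup they are meant for ($1 and $2 are assumed query parameters, not part of this migration):

-- hypothetical similarity search over workspace embeddings;
-- $1 is a 512-dimension query vector, $2 a workspace id
SELECT doc_id, chunk, content,
       embedding <=> $1::vector AS distance
FROM ai_workspace_embeddings
WHERE workspace_id = $2
ORDER BY embedding <=> $1::vector
LIMIT 5;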
51 changes: 47 additions & 4 deletions packages/backend/server/schema.prisma
@@ -1,12 +1,13 @@
 generator client {
   provider        = "prisma-client-js"
   binaryTargets   = ["native", "debian-openssl-3.0.x", "linux-arm64-openssl-3.0.x"]
-  previewFeatures = ["metrics", "relationJoins", "nativeDistinct"]
+  previewFeatures = ["metrics", "relationJoins", "nativeDistinct", "postgresqlExtensions"]
 }
 
 datasource db {
-  provider = "postgresql"
-  url      = env("DATABASE_URL")
+  provider   = "postgresql"
+  url        = env("DATABASE_URL")
+  extensions = [pgvector(map: "vector")]
 }
 
 model User {
@@ -281,6 +282,8 @@ model Snapshot {
   // we need to clear all hanging updates and snapshots before enabling the foreign key on workspaceId
   // workspace Workspace @relation(fields: [workspaceId], references: [id], onDelete: Cascade)
 
+  embedding AiWorkspaceEmbedding?
+
   @@id([workspaceId, id])
   @@index([workspaceId, updatedAt])
   @@map("snapshots")
@@ -426,11 +429,51 @@ model AiContext {
   createdAt DateTime @default(now()) @map("created_at") @db.Timestamptz(3)
   updatedAt DateTime @updatedAt @map("updated_at") @db.Timestamptz(3)
 
-  session AiSession @relation(fields: [sessionId], references: [id], onDelete: Cascade)
+  embeddings AiContextEmbedding[]
+  session    AiSession            @relation(fields: [sessionId], references: [id], onDelete: Cascade)
 
   @@map("ai_contexts")
 }

+model AiContextEmbedding {
+  id        String                     @id @default(uuid()) @db.VarChar
+  contextId String                     @map("context_id") @db.VarChar
+  fileId    String                     @map("file_id") @db.VarChar
+  // a file can be divided into multiple chunks and embedded separately.
+  chunk     Int                        @db.Integer
+  content   String                     @db.VarChar
+  embedding Unsupported("vector(512)")
+
+  createdAt DateTime @default(now()) @map("created_at") @db.Timestamptz(3)
+  updatedAt DateTime @updatedAt @map("updated_at") @db.Timestamptz(3)
+
+  context AiContext @relation(fields: [contextId], references: [id], onDelete: Cascade)
+
+  @@unique([contextId, fileId, chunk])
+  @@index([embedding], map: "ai_context_embeddings_idx")
+  @@map("ai_context_embeddings")
+}

+model AiWorkspaceEmbedding {
+  workspaceId String                     @map("workspace_id") @db.VarChar
+  docId       String                     @map("doc_id") @db.VarChar
+  // a doc can be divided into multiple chunks and embedded separately.
+  chunk       Int                        @db.Integer
+  content     String                     @db.VarChar
+  embedding   Unsupported("vector(512)")
+
+  createdAt DateTime @default(now()) @map("created_at") @db.Timestamptz(3)
+  updatedAt DateTime @updatedAt @map("updated_at") @db.Timestamptz(3)
+
+  // workspace-level search is not available for non-cloud workspaces,
+  // so we can match this record with the snapshot one-to-one
+  snapshot Snapshot @relation(fields: [workspaceId, docId], references: [workspaceId, id], onDelete: Cascade)
+
+  @@id([workspaceId, docId])
+  @@index([embedding], map: "ai_workspace_embeddings_idx")
+  @@map("ai_workspace_embeddings")
+}

 model DataMigration {
   id   String @id @default(uuid()) @db.VarChar
   name String @unique @db.VarChar
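Prisma's generated client cannot read or write Unsupported("vector(512)") columns, so embedding rows presumably have to be written with raw SQL. Note also that updated_at has no database-side default: @updatedAt is enforced by the Prisma client, not the database, so a raw write must set it explicitly. A minimal sketch under those assumptions ($1..$5 are hypothetical parameters; gen_random_uuid() assumes PostgreSQL 13+):

-- hypothetical raw upsert for a context embedding; $5 is a 512-dimension vector
INSERT INTO ai_context_embeddings
  (id, context_id, file_id, chunk, content, embedding, updated_at)
VALUES (gen_random_uuid(), $1, $2, $3, $4, $5::vector, CURRENT_TIMESTAMP)
ON CONFLICT (context_id, file_id, chunk) DO UPDATE
SET content    = EXCLUDED.content,
    embedding  = EXCLUDED.embedding,
    updated_at = CURRENT_TIMESTAMP;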
@@ -43,3 +43,14 @@ Generated by [AVA](https://avajs.dev).
        id: 'docId1',
      },
    ]

> should list context docs

    [
      {
        blobId: 'fileId1',
        chunkSize: 0,
        name: 'sample.pdf',
        status: 'processing',
      },
    ]