Skip to content

Commit

Permalink
Fix preset dimensions on vectorDBs to be inferred for providers who …
Browse files Browse the repository at this point in the history
…require it (#605)
  • Loading branch information
timothycarambat authored Jan 16, 2024
1 parent f5bb064 commit d0a3f1e
Show file tree
Hide file tree
Showing 6 changed files with 35 additions and 26 deletions.
1 change: 0 additions & 1 deletion server/utils/EmbeddingEngines/azureOpenAi/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ class AzureOpenAiEmbedder {
new AzureKeyCredential(process.env.AZURE_OPENAI_KEY)
);
this.openai = openai;
this.dimensions = 1536;

// Limit of how many strings we can process in a single pass to stay with resource or network limits
// https://learn.microsoft.com/en-us/azure/ai-services/openai/faq#i-am-trying-to-use-embeddings-and-received-the-error--invalidrequesterror--too-many-inputs--the-max-number-of-inputs-is-1---how-do-i-fix-this-:~:text=consisting%20of%20up%20to%2016%20inputs%20per%20API%20request
Expand Down
4 changes: 0 additions & 4 deletions server/utils/EmbeddingEngines/localAi/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,6 @@ class LocalAiEmbedder {
: {}),
});
this.openai = new OpenAIApi(config);
// We don't know this for user's set model so for vectorDB integrations that requires dimensionality
// in schema, we will throw an error.
// Applies to QDrant and Milvus.
this.dimensions = null;

// Limit of how many strings we can process in a single pass to stay with resource or network limits
this.maxConcurrentChunks = 50;
Expand Down
1 change: 0 additions & 1 deletion server/utils/EmbeddingEngines/native/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ class NativeEmbedder {
: path.resolve(__dirname, `../../../storage/models`)
);
this.modelPath = path.resolve(this.cacheDir, "Xenova", "all-MiniLM-L6-v2");
this.dimensions = 384;

// Limit of how many strings we can process in a single pass to stay with resource or network limits
this.maxConcurrentChunks = 25;
Expand Down
1 change: 0 additions & 1 deletion server/utils/EmbeddingEngines/openAi/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ class OpenAiEmbedder {
});
const openai = new OpenAIApi(config);
this.openai = openai;
this.dimensions = 1536;

// Limit of how many strings we can process in a single pass to stay with resource or network limits
this.maxConcurrentChunks = 500;
Expand Down
20 changes: 12 additions & 8 deletions server/utils/vectorDbProviders/milvus/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -81,13 +81,15 @@ const Milvus = {
await client.dropCollection({ collection_name: namespace });
return true;
},
getOrCreateCollection: async function (client, namespace) {
// Milvus requires the vector dimension when a collection is created, so the
// caller passes it in, inferred from the first chunk, as is done for other
// providers.
getOrCreateCollection: async function (client, namespace, dimensions = null) {
const isExists = await this.namespaceExists(client, namespace);
if (!isExists) {
const embedder = getEmbeddingEngineSelection();
if (!embedder.dimensions)
if (!dimensions)
throw new Error(
`Your embedder selection has unknown dimensions output. It should be defined when using ${this.name}. Open an issue on Github for support.`
`Milvus:getOrCreateCollection Unable to infer vector dimension from input. Open an issue on Github for support.`
);

await client.createCollection({
Expand All @@ -104,7 +106,7 @@ const Milvus = {
name: "vector",
description: "vector",
data_type: DataType.FloatVector,
dim: embedder.dimensions,
dim: dimensions,
},
{
name: "metadata",
Expand All @@ -131,18 +133,19 @@ const Milvus = {
) {
const { DocumentVectors } = require("../../../models/vectors");
try {
let vectorDimension = null;
const { pageContent, docId, ...metadata } = documentData;
if (!pageContent || pageContent.length == 0) return false;

console.log("Adding new vectorized document into namespace", namespace);
const cacheResult = await cachedVectorInformation(fullFilePath);
if (cacheResult.exists) {
const { client } = await this.connect();
await this.getOrCreateCollection(client, namespace);

const { chunks } = cacheResult;
const documentVectors = [];
vectorDimension = chunks[0][0].values.length || null;

await this.getOrCreateCollection(client, namespace, vectorDimension);
for (const chunk of chunks) {
// Before sending to Pinecone and saving the records to our db
// we need to assign the id of each chunk that is stored in the cached file.
Expand Down Expand Up @@ -182,6 +185,7 @@ const Milvus = {

if (!!vectorValues && vectorValues.length > 0) {
for (const [i, vector] of vectorValues.entries()) {
if (!vectorDimension) vectorDimension = vector.length;
const vectorRecord = {
id: uuidv4(),
values: vector,
Expand All @@ -202,7 +206,7 @@ const Milvus = {
if (vectors.length > 0) {
const chunks = [];
const { client } = await this.connect();
await this.getOrCreateCollection(client, namespace);
await this.getOrCreateCollection(client, namespace, vectorDimension);

console.log("Inserting vectorized chunks into Milvus.");
for (const chunk of toChunks(vectors, 100)) {
Expand Down
34 changes: 23 additions & 11 deletions server/utils/vectorDbProviders/qdrant/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -108,19 +108,20 @@ const QDrant = {
await client.deleteCollection(namespace);
return true;
},
getOrCreateCollection: async function (client, namespace) {
// QDrant requires the vector dimension when a collection is created, so the
// caller passes it in, inferred from the first chunk, as is done for other
// providers.
getOrCreateCollection: async function (client, namespace, dimensions = null) {
if (await this.namespaceExists(client, namespace)) {
return await client.getCollection(namespace);
}

const embedder = getEmbeddingEngineSelection();
if (!embedder.dimensions)
if (!dimensions)
throw new Error(
`Your embedder selection has unknown dimensions output. It should be defined when using ${this.name}. Open an issue on Github for support.`
`Qdrant:getOrCreateCollection Unable to infer vector dimension from input. Open an issue on Github for support.`
);
await client.createCollection(namespace, {
vectors: {
size: embedder.dimensions,
size: dimensions,
distance: "Cosine",
},
});
Expand All @@ -133,22 +134,28 @@ const QDrant = {
) {
const { DocumentVectors } = require("../../../models/vectors");
try {
let vectorDimension = null;
const { pageContent, docId, ...metadata } = documentData;
if (!pageContent || pageContent.length == 0) return false;

console.log("Adding new vectorized document into namespace", namespace);
const cacheResult = await cachedVectorInformation(fullFilePath);
if (cacheResult.exists) {
const { client } = await this.connect();
const collection = await this.getOrCreateCollection(client, namespace);
const { chunks } = cacheResult;
const documentVectors = [];
vectorDimension = chunks[0][0].vector.length || null;

const collection = await this.getOrCreateCollection(
client,
namespace,
vectorDimension
);
if (!collection)
throw new Error("Failed to create new QDrant collection!", {
namespace,
});

const { chunks } = cacheResult;
const documentVectors = [];

for (const chunk of chunks) {
const submission = {
ids: [],
Expand Down Expand Up @@ -210,6 +217,7 @@ const QDrant = {

if (!!vectorValues && vectorValues.length > 0) {
for (const [i, vector] of vectorValues.entries()) {
if (!vectorDimension) vectorDimension = vector.length;
const vectorRecord = {
id: uuidv4(),
vector: vector,
Expand All @@ -233,7 +241,11 @@ const QDrant = {
}

const { client } = await this.connect();
const collection = await this.getOrCreateCollection(client, namespace);
const collection = await this.getOrCreateCollection(
client,
namespace,
vectorDimension
);
if (!collection)
throw new Error("Failed to create new QDrant collection!", {
namespace,
Expand Down

0 comments on commit d0a3f1e

Please sign in to comment.