Skip to content

Commit 445dc12

Browse files
Commit authored: feat(ingester): support metadata updates only (#73)
1 parent 4d7bd49 commit 445dc12

File tree

7 files changed

+6210
-6444
lines changed

7 files changed

+6210
-6444
lines changed

ingesters/__tests__/vectorStoreUtils.test.ts

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ describe('findChunksToUpdateAndRemove', () => {
8787

8888
const result = findChunksToUpdateAndRemove(freshChunks, storedChunkHashes);
8989

90-
expect(result.chunksToUpdate).toEqual([
90+
expect(result.contentChanged).toEqual([
9191
{
9292
metadata: {
9393
name: '2',
@@ -113,6 +113,7 @@ describe('findChunksToUpdateAndRemove', () => {
113113
pageContent: 'Some Content 3',
114114
},
115115
]);
116+
expect(result.metadataOnlyChanged).toEqual([]);
116117
expect(result.chunksToRemove).toEqual(['3']);
117118
});
118119

@@ -173,14 +174,15 @@ describe('findChunksToUpdateAndRemove', () => {
173174

174175
const result = findChunksToUpdateAndRemove(freshChunks, storedChunkHashes);
175176

176-
expect(result.chunksToUpdate).toEqual([]);
177+
expect(result.contentChanged).toEqual([]);
178+
expect(result.metadataOnlyChanged).toEqual([]);
177179
expect(result.chunksToRemove).toEqual([]);
178180
});
179181

180182
it('should handle empty inputs correctly', () => {
181183
const result = findChunksToUpdateAndRemove([], []);
182-
183-
expect(result.chunksToUpdate).toEqual([]);
184+
expect(result.contentChanged).toEqual([]);
185+
expect(result.metadataOnlyChanged).toEqual([]);
184186
expect(result.chunksToRemove).toEqual([]);
185187
});
186188

@@ -217,8 +219,8 @@ describe('findChunksToUpdateAndRemove', () => {
217219

218220
const result = findChunksToUpdateAndRemove(freshChunks, storedChunkHashes);
219221

220-
// Should update because metadata changed (sourceLink and title)
221-
expect(result.chunksToUpdate).toEqual([
222+
// Should update metadata-only because metadata changed (sourceLink and title)
223+
expect(result.metadataOnlyChanged).toEqual([
222224
{
223225
metadata: {
224226
name: '1',
@@ -232,6 +234,7 @@ describe('findChunksToUpdateAndRemove', () => {
232234
pageContent: 'Some Content 1',
233235
},
234236
]);
237+
expect(result.contentChanged).toEqual([]);
235238
expect(result.chunksToRemove).toEqual([]);
236239
});
237240
});

ingesters/src/db/postgresVectorStore.ts

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -357,6 +357,53 @@ export class VectorStore {
357357
}
358358
}
359359

360+
/**
361+
* Update only the metadata (and source column for consistency) for existing documents.
362+
* Does NOT modify content, embedding, or contentHash.
363+
*/
364+
async updateDocumentsMetadata(
365+
documents: DocumentInterface[],
366+
options?: { ids?: string[] },
367+
): Promise<void> {
368+
if (documents.length === 0) return;
369+
370+
logger.info(`Updating metadata for ${documents.length} documents`);
371+
372+
try {
373+
const client = await this.pool.connect();
374+
try {
375+
await client.query('BEGIN');
376+
377+
const updates = documents.map((doc, i) => {
378+
const uniqueId = options?.ids?.[i] || doc.metadata.uniqueId || null;
379+
const source = doc.metadata.source || null;
380+
const query = `
381+
UPDATE ${this.tableName}
382+
SET metadata = $2,
383+
source = $3
384+
WHERE uniqueId = $1
385+
`;
386+
return client.query(query, [
387+
uniqueId,
388+
JSON.stringify(doc.metadata),
389+
source,
390+
]);
391+
});
392+
393+
await Promise.all(updates);
394+
await client.query('COMMIT');
395+
} catch (error) {
396+
await client.query('ROLLBACK');
397+
throw error;
398+
} finally {
399+
client.release();
400+
}
401+
} catch (error) {
402+
logger.error('Error updating document metadata:', error);
403+
throw DatabaseError.handlePgError(error as PgError);
404+
}
405+
}
406+
360407
/**
361408
* Find a specific book chunk by name
362409
* @param name - Name of the book chunk

ingesters/src/ingesters/CairoBookIngester.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ export class CairoBookIngester extends MarkdownIngester {
3333
chunkOverlap: 512,
3434
baseUrl: 'https://book.cairo-lang.org',
3535
urlSuffix: '.html',
36-
useUrlMapping: false,
36+
useUrlMapping: true,
3737
};
3838

3939
super(config, DocumentSource.CAIRO_BOOK);
@@ -71,7 +71,7 @@ export class CairoBookIngester extends MarkdownIngester {
7171
maxChars: 2048,
7272
minChars: 500,
7373
overlap: 256,
74-
headerLevels: [1, 2], // Split on H1 and H2 headers
74+
headerLevels: [1, 2, 3], // Split on H1/H2/H3 (title uses deepest)
7575
preserveCodeBlocks: true,
7676
idPrefix: 'cairo-book',
7777
trim: true,
@@ -97,7 +97,7 @@ export class CairoBookIngester extends MarkdownIngester {
9797
chunkNumber: chunk.meta.chunkNumber, // Already 0-based
9898
contentHash: contentHash,
9999
uniqueId: chunk.meta.uniqueId,
100-
sourceLink: this.config.baseUrl,
100+
sourceLink: chunk.meta.sourceLink || this.config.baseUrl,
101101
source: this.source,
102102
},
103103
});

ingesters/src/ingesters/CoreLibDocsIngester.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ export class CoreLibDocsIngester extends MarkdownIngester {
7575
maxChars: 2048,
7676
minChars: 500,
7777
overlap: 256,
78-
headerLevels: [1, 2], // Split on H1 and H2 headers
78+
headerLevels: [1, 2, 3], // Split on H1/H2/H3 (title uses deepest)
7979
preserveCodeBlocks: true,
8080
idPrefix: 'corelib',
8181
trim: true,
@@ -101,7 +101,7 @@ export class CoreLibDocsIngester extends MarkdownIngester {
101101
chunkNumber: chunk.meta.chunkNumber, // Already 0-based
102102
contentHash: contentHash,
103103
uniqueId: chunk.meta.uniqueId,
104-
sourceLink: this.config.baseUrl,
104+
sourceLink: chunk.meta.sourceLink || this.config.baseUrl,
105105
source: this.source,
106106
},
107107
});

ingesters/src/ingesters/OpenZeppelinDocsIngester.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ export class OpenZeppelinDocsIngester extends MarkdownIngester {
7575
maxChars: 2048,
7676
minChars: 500,
7777
overlap: 256,
78-
headerLevels: [1, 2], // Split on H1 and H2 headers
78+
headerLevels: [1, 2, 3], // Split on H1/H2/H3 (title uses deepest)
7979
preserveCodeBlocks: true,
8080
idPrefix: 'openzeppelin-docs',
8181
trim: true,
@@ -101,7 +101,7 @@ export class OpenZeppelinDocsIngester extends MarkdownIngester {
101101
chunkNumber: chunk.meta.chunkNumber, // Already 0-based
102102
contentHash: contentHash,
103103
uniqueId: chunk.meta.uniqueId,
104-
sourceLink: this.config.baseUrl,
104+
sourceLink: chunk.meta.sourceLink || this.config.baseUrl,
105105
source: this.source,
106106
},
107107
});

ingesters/src/utils/vectorStoreUtils.ts

Lines changed: 56 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@ export function findChunksToUpdateAndRemove(
2323
metadata: BookChunk;
2424
}[],
2525
): {
26-
chunksToUpdate: Document<BookChunk>[];
26+
contentChanged: Document<BookChunk>[];
27+
metadataOnlyChanged: Document<BookChunk>[];
2728
chunksToRemove: string[];
2829
} {
2930
const storedDataMap = new Map(
@@ -33,31 +34,51 @@ export function findChunksToUpdateAndRemove(
3334
freshChunks.map((chunk) => [chunk.metadata.uniqueId, chunk]),
3435
);
3536

36-
// Find chunks that need to be updated (content or metadata has changed)
37-
const chunksToUpdate = freshChunks.filter((chunk) => {
38-
const storedMetadata = storedDataMap.get(chunk.metadata.uniqueId);
39-
if (!storedMetadata) {
40-
// New chunk that doesn't exist in storage
41-
return true;
37+
const contentChanged: Document<BookChunk>[] = [];
38+
const metadataOnlyChanged: Document<BookChunk>[] = [];
39+
40+
for (const fresh of freshChunks) {
41+
const stored = storedDataMap.get(fresh.metadata.uniqueId);
42+
if (!stored) {
43+
// New doc: requires full insert + embedding
44+
contentChanged.push(fresh);
45+
continue;
46+
}
47+
48+
const storedHash = stored.contentHash;
49+
const freshHash = fresh.metadata.contentHash;
50+
if (storedHash !== freshHash) {
51+
// Content changed: re-embed and upsert fully
52+
contentChanged.push(fresh);
53+
continue;
4254
}
43-
// Update if content hash changed or any metadata field changed
44-
for (const key in chunk.metadata) {
45-
if (
46-
storedMetadata[key as keyof BookChunk] !==
47-
chunk.metadata[key as keyof BookChunk]
48-
) {
49-
return true;
55+
56+
// Content same, check if any metadata field differs
57+
const keys = new Set<keyof BookChunk>([
58+
...(Object.keys(stored) as (keyof BookChunk)[]),
59+
...(Object.keys(fresh.metadata) as (keyof BookChunk)[]),
60+
]);
61+
62+
let metaDiffers = false;
63+
for (const key of keys) {
64+
// Ignore contentHash here since we already know it's equal
65+
if (key === 'contentHash') continue;
66+
if (stored[key] !== fresh.metadata[key]) {
67+
metaDiffers = true;
68+
break;
5069
}
5170
}
52-
return false;
53-
});
71+
if (metaDiffers) {
72+
metadataOnlyChanged.push(fresh);
73+
}
74+
}
5475

5576
// Find chunks that need to be removed (no longer exist in fresh chunks)
5677
const chunksToRemove = storedChunkHashes
5778
.filter((stored) => !freshChunksMap.has(stored.uniqueId))
5879
.map((stored) => stored.uniqueId);
5980

60-
return { chunksToUpdate, chunksToRemove };
81+
return { contentChanged, metadataOnlyChanged, chunksToRemove };
6182
}
6283

6384
/**
@@ -80,16 +101,18 @@ export async function updateVectorStore(
80101
await vectorStore.getStoredBookPagesMetadata(source);
81102

82103
// Find chunks to update and remove
83-
const { chunksToUpdate, chunksToRemove } = findChunksToUpdateAndRemove(
84-
chunks,
85-
storedChunkHashes,
86-
);
104+
const { contentChanged, metadataOnlyChanged, chunksToRemove } =
105+
findChunksToUpdateAndRemove(chunks, storedChunkHashes);
87106

88107
logger.info(
89-
`Found ${storedChunkHashes.length} stored chunks for source: ${source}. ${chunksToUpdate.length} chunks to update and ${chunksToRemove.length} chunks to remove`,
108+
`Found ${storedChunkHashes.length} stored chunks for source: ${source}. ${contentChanged.length} content changes, ${metadataOnlyChanged.length} metadata-only changes, and ${chunksToRemove.length} removals`,
90109
);
91110

92-
if (chunksToUpdate.length === 0 && chunksToRemove.length === 0) {
111+
if (
112+
contentChanged.length === 0 &&
113+
metadataOnlyChanged.length === 0 &&
114+
chunksToRemove.length === 0
115+
) {
93116
logger.info('No changes to update or remove');
94117
return;
95118
}
@@ -129,13 +152,19 @@ export async function updateVectorStore(
129152
}
130153

131154
// Update chunks that have changed
132-
if (chunksToUpdate.length > 0) {
133-
await vectorStore.addDocuments(chunksToUpdate, {
134-
ids: chunksToUpdate.map((chunk) => chunk.metadata.uniqueId),
155+
if (contentChanged.length > 0) {
156+
await vectorStore.addDocuments(contentChanged, {
157+
ids: contentChanged.map((chunk) => chunk.metadata.uniqueId),
158+
});
159+
}
160+
161+
if (metadataOnlyChanged.length > 0) {
162+
await vectorStore.updateDocumentsMetadata(metadataOnlyChanged, {
163+
ids: metadataOnlyChanged.map((chunk) => chunk.metadata.uniqueId),
135164
});
136165
}
137166

138167
logger.info(
139-
`Updated ${chunksToUpdate.length} chunks and removed ${chunksToRemove.length} chunks for source: ${source}.`,
168+
`Updated ${contentChanged.length} content chunks, ${metadataOnlyChanged.length} metadata-only chunks, and removed ${chunksToRemove.length} chunks for source: ${source}.`,
140169
);
141170
}

0 commit comments

Comments (0)