Skip to content

Commit 7e36518

Browse files
committed
Implement nip50 fulltext searching
This adds support for nip50 fulltext searches. You can now run fulltext searches through the nostrdb query interface instead of the dedicated `ndb_text_search` api. The benefit is a standardized query interface that also applies the other fields in the filter to the search results. Changelog-Added: Add nip50 search filters and queries
1 parent 2a4eb70 commit 7e36518

File tree

3 files changed

+113
-57
lines changed

3 files changed

+113
-57
lines changed

ndb.c

Lines changed: 12 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,7 @@ static int usage()
1717
printf("usage: ndb [--skip-verification] [-d db_dir] <command>\n\n");
1818
printf("commands\n\n");
1919
printf(" stat\n");
20-
printf(" search [--oldest-first] [--limit 42] <fulltext query>\n");
21-
printf(" query [-k 42] [-k 1337] [-l 42] [-e abcdef...] [-a abcdef... -a bcdef...]\n");
20+
printf(" query [-k 42] [-k 1337] [--search term] [-l 42] [-e abcdef...] [-a abcdef... -a bcdef...]\n");
2221
printf(" profile <pubkey>\n");
2322
printf(" print-search-keys\n");
2423
printf(" print-kind-keys\n");
@@ -136,14 +135,11 @@ int main(int argc, char *argv[])
136135
long nanos;
137136
struct ndb_stat stat;
138137
struct ndb_txn txn;
139-
struct ndb_text_search_results results;
140-
struct ndb_text_search_result *result;
141138
const char *dir;
142139
unsigned char *data;
143140
size_t data_len;
144141
struct ndb_config config;
145142
struct timespec t1, t2;
146-
struct ndb_text_search_config search_config;
147143
unsigned char tmp_id[32];
148144

149145
// profiles
@@ -154,7 +150,6 @@ int main(int argc, char *argv[])
154150

155151
res = 0;
156152
ndb_default_config(&config);
157-
ndb_default_text_search_config(&search_config);
158153
ndb_config_set_mapsize(&config, 1024ULL * 1024ULL * 1024ULL * 1024ULL /* 1 TiB */);
159154

160155
if (argc < 2) {
@@ -184,38 +179,7 @@ int main(int argc, char *argv[])
184179
return 2;
185180
}
186181

187-
if (argc >= 3 && !strcmp(argv[1], "search")) {
188-
for (i = 0; i < 2; i++) {
189-
if (!strcmp(argv[2], "--oldest-first")) {
190-
ndb_text_search_config_set_order(&search_config, NDB_ORDER_ASCENDING);
191-
argv++;
192-
argc--;
193-
} else if (!strcmp(argv[2], "--limit") || !strcmp(argv[2], "-l")) {
194-
limit = atoi(argv[3]);
195-
ndb_text_search_config_set_limit(&search_config, limit);
196-
argv += 2;
197-
argc -= 2;
198-
}
199-
}
200-
201-
ndb_begin_query(ndb, &txn);
202-
clock_gettime(CLOCK_MONOTONIC, &t1);
203-
ndb_text_search(&txn, argv[2], &results, &search_config);
204-
clock_gettime(CLOCK_MONOTONIC, &t2);
205-
206-
nanos = (t2.tv_sec - t1.tv_sec) * (long)1e9 + (t2.tv_nsec - t1.tv_nsec);
207-
208-
fprintf(stderr, "%d results in %f ms\n", results.num_results, nanos/1000000.0);
209-
210-
// print results for now
211-
for (i = 0; i < results.num_results; i++) {
212-
result = &results.results[i];
213-
//fprintf(stderr, "[%02d] ", i+1);
214-
ndb_print_text_search_result(&txn, result);
215-
}
216-
217-
ndb_end_query(&txn);
218-
} else if (argc == 2 && !strcmp(argv[1], "stat")) {
182+
if (argc == 2 && !strcmp(argv[1], "stat")) {
219183
if (!ndb_stat(ndb, &stat)) {
220184
res = 3;
221185
goto cleanup;
@@ -282,6 +246,16 @@ int main(int argc, char *argv[])
282246
ndb_filter_end_field(f);
283247
argv += 2;
284248
argc -= 2;
249+
} else if (!strcmp(argv[0], "--search") || !strcmp(argv[0], "-S")) {
250+
if (current_field) {
251+
ndb_filter_end_field(f);
252+
current_field = 0;
253+
}
254+
ndb_filter_start_field(f, NDB_FILTER_SEARCH);
255+
ndb_filter_add_str_element(f, argv[1]);
256+
ndb_filter_end_field(f);
257+
argv += 2;
258+
argc -= 2;
285259
} else if (!strcmp(argv[0], "-e")) {
286260
if (current_field != 'e') {
287261
if (!ndb_filter_start_tag_field(f, 'e')) {

src/nostrdb.c

Lines changed: 97 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,7 @@ enum ndb_query_plan {
230230
NDB_PLAN_AUTHOR_KINDS,
231231
NDB_PLAN_CREATED,
232232
NDB_PLAN_TAGS,
233+
NDB_PLAN_SEARCH,
233234
};
234235

235236
// A id + u64 + timestamp
@@ -334,27 +335,27 @@ static int ndb_make_noted_text_search_key(unsigned char *buf, int bufsize,
334335

335336
static int ndb_make_text_search_key_low(unsigned char *buf, int bufsize,
336337
int wordlen, const char *word,
338+
uint64_t since,
337339
int *keysize)
338340
{
339-
uint64_t timestamp, note_id;
340-
timestamp = 0;
341+
uint64_t note_id;
341342
note_id = 0;
342343
return ndb_make_text_search_key(buf, bufsize, 0, wordlen, word,
343-
timestamp, note_id, keysize);
344+
since, note_id, keysize);
344345
}
345346

346347
static int ndb_make_text_search_key_high(unsigned char *buf, int bufsize,
347348
int wordlen, const char *word,
349+
uint64_t until,
348350
int *keysize)
349351
{
350-
uint64_t timestamp, note_id;
351-
timestamp = INT32_MAX;
352+
uint64_t note_id;
352353
note_id = INT32_MAX;
353354
return ndb_make_text_search_key(buf, bufsize, 0, wordlen, word,
354-
timestamp, note_id, keysize);
355+
until, note_id, keysize);
355356
}
356357

357-
typedef int (*ndb_text_search_key_order_fn)(unsigned char *buf, int bufsize, int wordlen, const char *word, int *keysize);
358+
typedef int (*ndb_text_search_key_order_fn)(unsigned char *buf, int bufsize, int wordlen, const char *word, uint64_t timestamp, int *keysize);
358359

359360
/** From LMDB: Compare two items lexically */
360361
static int mdb_cmp_memn(const MDB_val *a, const MDB_val *b) {
@@ -3044,6 +3045,43 @@ static int query_is_full(struct ndb_query_results *results, int limit)
30443045
return cursor_count(&results->cur, sizeof(struct ndb_query_result)) >= limit;
30453046
}
30463047

3048+
static int ndb_query_plan_execute_search(struct ndb_txn *txn,
3049+
struct ndb_filter *filter,
3050+
struct ndb_query_results *results,
3051+
int limit)
3052+
{
3053+
const char *search;
3054+
int i;
3055+
struct ndb_text_search_results text_results;
3056+
struct ndb_text_search_result *text_result;
3057+
struct ndb_text_search_config config;
3058+
struct ndb_query_result result;
3059+
3060+
ndb_default_text_search_config(&config);
3061+
3062+
if (!(search = ndb_filter_find_search(filter)))
3063+
return 0;
3064+
3065+
if (!ndb_text_search_with(txn, search, &text_results, &config, filter))
3066+
return 0;
3067+
3068+
for (i = 0; i < text_results.num_results; i++) {
3069+
if (query_is_full(results, limit))
3070+
break;
3071+
3072+
text_result = &text_results.results[i];
3073+
3074+
result.note = text_result->note;
3075+
result.note_size = text_result->note_size;
3076+
result.note_id = text_result->key.note_id;
3077+
3078+
if (!push_query_result(results, &result))
3079+
break;
3080+
}
3081+
3082+
return 1;
3083+
}
3084+
30473085
static int ndb_query_plan_execute_ids(struct ndb_txn *txn,
30483086
struct ndb_filter *filter,
30493087
struct ndb_query_results *results,
@@ -3456,15 +3494,18 @@ static int ndb_query_plan_execute_kinds(struct ndb_txn *txn,
34563494

34573495
static enum ndb_query_plan ndb_filter_plan(struct ndb_filter *filter)
34583496
{
3459-
struct ndb_filter_elements *ids, *kinds, *authors, *tags;
3497+
struct ndb_filter_elements *ids, *kinds, *authors, *tags, *search;
34603498

34613499
ids = ndb_filter_find_elements(filter, NDB_FILTER_IDS);
3500+
search = ndb_filter_find_elements(filter, NDB_FILTER_SEARCH);
34623501
kinds = ndb_filter_find_elements(filter, NDB_FILTER_KINDS);
34633502
authors = ndb_filter_find_elements(filter, NDB_FILTER_AUTHORS);
34643503
tags = ndb_filter_find_elements(filter, NDB_FILTER_TAGS);
34653504

34663505
// this is roughly similar to the heuristic in strfry's dbscan
3467-
if (ids) {
3506+
if (search) {
3507+
return NDB_PLAN_SEARCH;
3508+
} else if (ids) {
34683509
return NDB_PLAN_IDS;
34693510
} else if (kinds && authors && authors->count <= 10) {
34703511
return NDB_PLAN_AUTHOR_KINDS;
@@ -3483,6 +3524,7 @@ static const char *ndb_query_plan_name(int plan_id)
34833524
{
34843525
switch (plan_id) {
34853526
case NDB_PLAN_IDS: return "ids";
3527+
case NDB_PLAN_SEARCH: return "search";
34863528
case NDB_PLAN_KINDS: return "kinds";
34873529
case NDB_PLAN_TAGS: return "tags";
34883530
case NDB_PLAN_CREATED: return "created";
@@ -3518,6 +3560,11 @@ static int ndb_query_filter(struct ndb_txn *txn, struct ndb_filter *filter,
35183560
return 0;
35193561
break;
35203562

3563+
case NDB_PLAN_SEARCH:
3564+
if (!ndb_query_plan_execute_search(txn, filter, &results, limit))
3565+
return 0;
3566+
break;
3567+
35213568
// We have just kinds, just scan the kind index
35223569
case NDB_PLAN_KINDS:
35233570
if (!ndb_query_plan_execute_kinds(txn, filter, &results, limit))
@@ -4031,24 +4078,44 @@ int ndb_text_search_with(struct ndb_txn *txn, const char *query,
40314078
struct ndb_word *search_word;
40324079
struct ndb_note *note;
40334080
struct cursor cur;
4081+
uint64_t since, until, timestamp_op, *pint, note_size;
40344082
ndb_text_search_key_order_fn key_order_fn;
40354083
MDB_dbi text_db;
40364084
MDB_cursor *cursor;
40374085
MDB_val k, v;
40384086
int i, j, keysize, saved_size, limit;
40394087
MDB_cursor_op op, order_op;
40404088

4089+
note_size = 0;
4090+
note = 0;
40414091
saved = NULL;
40424092
ndb_text_search_results_init(results);
40434093
ndb_search_words_init(&search_words);
40444094

4045-
// search config
4095+
until = UINT64_MAX;
4096+
since = 0;
40464097
limit = MAX_TEXT_SEARCH_RESULTS;
4098+
4099+
// until, since from filter
4100+
if (filter != NULL) {
4101+
if ((pint = ndb_filter_get_int(filter, NDB_FILTER_UNTIL)))
4102+
until = *pint;
4103+
4104+
if ((pint = ndb_filter_get_int(filter, NDB_FILTER_SINCE)))
4105+
since = *pint;
4106+
4107+
if ((pint = ndb_filter_get_int(filter, NDB_FILTER_LIMIT)))
4108+
limit = *pint;
4109+
}
4110+
40474111
order_op = MDB_PREV;
40484112
key_order_fn = ndb_make_text_search_key_high;
4113+
timestamp_op = until;
40494114
if (config) {
40504115
if (config->order == NDB_ORDER_ASCENDING) {
40514116
order_op = MDB_NEXT;
4117+
// set the min timestamp value to since when ascending
4118+
timestamp_op = since;
40524119
key_order_fn = ndb_make_text_search_key_low;
40534120
}
40544121
limit = min(limit, config->limit);
@@ -4067,9 +4134,11 @@ int ndb_text_search_with(struct ndb_txn *txn, const char *query,
40674134
return 0;
40684135
}
40694136

4070-
// TODO: sort words from largest to smallest. This should complete the
4071-
// query quicker because the larger words are likely to have fewer
4072-
// entries in the search index.
4137+
// Sort the words from largest to smallest. This should complete the query quicker because the larger words are
4138+
// likely to have fewer entries in the search index. This is not always
4139+
// true. Words with higher frequency (like bitcoin on nostr in 2024)
4140+
// may be slower. TODO: Skip word recursion by leveraging a minimal
4141+
// perfect hashmap of parsed words on a note
40734142
sort_largest_to_smallest(&search_words);
40744143

40754144
// for each word, we recursively find all of the submatches
@@ -4099,7 +4168,9 @@ int ndb_text_search_with(struct ndb_txn *txn, const char *query,
40994168
// match
41004169
if (!key_order_fn(buffer, sizeof(buffer),
41014170
search_words.words[0].word_len,
4102-
search_words.words[0].word, &keysize))
4171+
search_words.words[0].word,
4172+
timestamp_op,
4173+
&keysize))
41034174
{
41044175
// word is too big to fit in 1024-sized key
41054176
continue;
@@ -4172,10 +4243,12 @@ int ndb_text_search_with(struct ndb_txn *txn, const char *query,
41724243

41734244
// save the first key match, since we will continue from
41744245
// this on the next root word result
4175-
if (j == 0 && !saved) {
4176-
memcpy(saved_buf, k.mv_data, k.mv_size);
4177-
saved = saved_buf;
4178-
saved_size = k.mv_size;
4246+
if (j == 0) {
4247+
if (!saved) {
4248+
memcpy(saved_buf, k.mv_data, k.mv_size);
4249+
saved = saved_buf;
4250+
saved_size = k.mv_size;
4251+
}
41794252

41804253
// since we will be trying to match the same
41814254
// note_id on all subsequent word matches,
@@ -4185,15 +4258,20 @@ int ndb_text_search_with(struct ndb_txn *txn, const char *query,
41854258
// remaining word queries
41864259
if (filter) {
41874260
if ((note = ndb_get_note_by_key(txn,
4188-
result->key.note_id, NULL)))
4261+
result->key.note_id,
4262+
&note_size)))
41894263
{
41904264
if (!ndb_filter_matches(filter, note)) {
41914265
break;
41924266
}
4267+
result->note = note;
4268+
result->note_size = note_size;
41934269
}
41944270
}
41954271
}
41964272

4273+
result->note = note;
4274+
result->note_size = note_size;
41974275
last_candidate = *result;
41984276
last_result = &last_candidate;
41994277
}

src/nostrdb.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,10 @@ struct ndb_text_search_key
314314
struct ndb_text_search_result {
315315
struct ndb_text_search_key key;
316316
int prefix_chars;
317+
318+
// This is only set if we passed a filter for nip50 searches
319+
struct ndb_note *note;
320+
uint64_t note_size;
317321
};
318322

319323
struct ndb_text_search_results {

0 commit comments

Comments
 (0)