5
5
6
6
import logging
7
7
import time
8
- from contextlib import contextmanager
8
+ from contextlib import contextmanager , nullcontext
9
9
from datetime import datetime , timedelta , timezone
10
10
from functools import wraps
11
11
from typing import Callable , Generator
24
24
from rest_framework .request import Request
25
25
from common .djangoapps .student .role_helpers import get_course_roles
26
26
from openedx .core .djangoapps .content .course_overviews .models import CourseOverview
27
- from openedx .core .djangoapps .content .search .models import get_access_ids_for_request
27
+ from openedx .core .djangoapps .content .search .models import get_access_ids_for_request , IncrementalIndexCompleted
28
+ from openedx .core .djangoapps .content .search .index_config import (
29
+ INDEX_DISTINCT_ATTRIBUTE ,
30
+ INDEX_FILTERABLE_ATTRIBUTES ,
31
+ INDEX_SEARCHABLE_ATTRIBUTES ,
32
+ INDEX_SORTABLE_ATTRIBUTES ,
33
+ INDEX_RANKING_RULES ,
34
+ )
28
35
from openedx .core .djangoapps .content_libraries import api as lib_api
29
36
from xmodule .modulestore .django import modulestore
30
37
@@ -217,6 +224,42 @@ def _using_temp_index(status_cb: Callable[[str], None] | None = None) -> Generat
217
224
_wait_for_meili_task (client .delete_index (temp_index_name ))
218
225
219
226
227
def _index_is_empty(index_name: str) -> bool:
    """
    Tell whether the given Meilisearch index currently holds no documents.

    Args:
        index_name (str): The name of the index to check

    Returns:
        bool: True when the index stats report a document count of zero.
    """
    # Fetch the index through the shared client and read its document count from the stats.
    return _get_meilisearch_client().get_index(index_name).get_stats().number_of_documents == 0
237
+
238
+
239
def _configure_index(index_name):
    """
    Apply the standard search settings to an index.

    These settings are best applied while the index is still empty: changing them on a
    populated index will "re-index all documents in the index", which can take some time.

    Args:
        index_name (str): The name of the index to configure
    """
    index = _get_meilisearch_client().index(index_name)

    # usage_key is not the primary key of the index, but it must nevertheless be unique:
    index.update_distinct_attribute(INDEX_DISTINCT_ATTRIBUTE)
    # Attributes available for filtering / faceted search:
    index.update_filterable_attributes(INDEX_FILTERABLE_ATTRIBUTES)
    # Attributes used for keyword search, in order of importance:
    index.update_searchable_attributes(INDEX_SEARCHABLE_ATTRIBUTES)
    # Attributes that search results may be sorted by:
    index.update_sortable_attributes(INDEX_SORTABLE_ATTRIBUTES)

    # Ranking rules: let the (optional) "sort" parameter take precedence over keyword relevance.
    # cf https://www.meilisearch.com/docs/learn/core_concepts/relevancy
    index.update_ranking_rules(INDEX_RANKING_RULES)
261
+
262
+
220
263
def _recurse_children (block , fn , status_cb : Callable [[str ], None ] | None = None ) -> None :
221
264
"""
222
265
Recurse the children of an XBlock and call the given function for each
@@ -279,8 +322,75 @@ def is_meilisearch_enabled() -> bool:
279
322
return False
280
323
281
324
282
- # pylint: disable=too-many-statements
283
- def rebuild_index (status_cb : Callable [[str ], None ] | None = None ) -> None :
325
def reset_index(status_cb: Callable[[str], None] | None = None) -> None:
    """
    Reset the Meilisearch index, deleting all documents and reconfiguring it.

    Args:
        status_cb: Optional callable that receives progress messages. When omitted,
            messages are sent to ``log.info``.
    """
    report = log.info if status_cb is None else status_cb

    report("Creating new empty index...")
    # A fresh temporary index is configured inside the context manager; what happens to it
    # on exit is decided by _using_temp_index.
    with _using_temp_index(report) as new_index_name:
        _configure_index(new_index_name)
        report("Index recreated!")
    report("Index reset complete.")
337
+
338
+
339
def _is_index_configured(index_name: str) -> bool:
    """
    Check if an index is completely configured.

    The live settings of the index are compared with the expected ``INDEX_*`` values.
    Filterable and sortable attributes are compared as sets, because their order carries
    no meaning. Searchable attributes (listed "in order of importance") and ranking rules
    (where "sort" must take precedence over relevance) are compared in order, so a
    reordering of either is reported as a misconfiguration.

    Args:
        index_name (str): The name of the index to check

    Returns:
        bool: True when every checked setting matches its expected value.
    """
    client = _get_meilisearch_client()
    index_settings = client.get_index(index_name).get_settings()

    # (settings key, expected value, whether element order is significant)
    expected_settings = (
        ("distinctAttribute", INDEX_DISTINCT_ATTRIBUTE, False),
        ("filterableAttributes", INDEX_FILTERABLE_ATTRIBUTES, False),
        ("searchableAttributes", INDEX_SEARCHABLE_ATTRIBUTES, True),
        ("sortableAttributes", INDEX_SORTABLE_ATTRIBUTES, False),
        ("rankingRules", INDEX_RANKING_RULES, True),
    )
    for key, expected, order_matters in expected_settings:
        actual = index_settings.get(key, [])
        if isinstance(expected, list):
            if order_matters:
                # Keep the sequence so that a changed priority order is detected.
                actual = list(actual)
            else:
                expected = set(expected)
                actual = set(actual)
        if actual != expected:
            return False
    return True
363
+
364
+
365
def init_index(
    status_cb: Callable[[str], None] | None = None,
    warn_cb: Callable[[str], None] | None = None,
) -> None:
    """
    Initialize the Meilisearch index, creating and configuring it if it doesn't exist.

    An index that already exists is never modified here: if it is empty or its settings
    do not match the expected configuration, a warning asking for a reindex is emitted.

    Args:
        status_cb: Optional callable for informational messages; defaults to ``log.info``.
        warn_cb: Optional callable for warning messages; defaults to ``log.warning``.
    """
    info = log.info if status_cb is None else status_cb
    warn = log.warning if warn_cb is None else warn_cb

    # No index yet: build a fresh, configured one and stop.
    if not _index_exists(STUDIO_INDEX_NAME):
        reset_index(info)
        return

    if _index_is_empty(STUDIO_INDEX_NAME):
        warn(
            "The studio search index is empty. Please run ./manage.py cms reindex_studio"
            " --experimental [--incremental]"
        )
    elif not _is_index_configured(STUDIO_INDEX_NAME):
        warn(
            "A rebuild of the index is required. Please run ./manage.py cms reindex_studio"
            " --experimental [--incremental]"
        )
    else:
        info("Index already exists and is configured.")
391
+
392
+
393
+ def rebuild_index (status_cb : Callable [[str ], None ] | None = None , incremental = False ) -> None : # lint-amnesty, pylint: disable=too-many-statements
284
394
"""
285
395
Rebuild the Meilisearch index from scratch
286
396
"""
@@ -292,96 +402,40 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:
292
402
293
403
# Get the lists of libraries
294
404
status_cb ("Counting libraries..." )
295
- lib_keys = [lib .library_key for lib in lib_api .ContentLibrary .objects .select_related ('org' ).only ('org' , 'slug' )]
405
+ keys_indexed = []
406
+ if incremental :
407
+ keys_indexed = list (IncrementalIndexCompleted .objects .values_list ("context_key" , flat = True ))
408
+ lib_keys = [
409
+ lib .library_key
410
+ for lib in lib_api .ContentLibrary .objects .select_related ("org" ).only ("org" , "slug" ).order_by ("-id" )
411
+ if lib .library_key not in keys_indexed
412
+ ]
296
413
num_libraries = len (lib_keys )
297
414
298
415
# Get the list of courses
299
416
status_cb ("Counting courses..." )
300
417
num_courses = CourseOverview .objects .count ()
301
418
302
419
# Some counters so we can track our progress as indexing progresses:
303
- num_contexts = num_courses + num_libraries
304
- num_contexts_done = 0 # How many courses/libraries we've indexed
420
+ num_libs_skipped = len (keys_indexed )
421
+ num_contexts = num_courses + num_libraries + num_libs_skipped
422
+ num_contexts_done = 0 + num_libs_skipped # How many courses/libraries we've indexed
305
423
num_blocks_done = 0 # How many individual components/XBlocks we've indexed
306
424
307
425
status_cb (f"Found { num_courses } courses, { num_libraries } libraries." )
308
- with _using_temp_index (status_cb ) as temp_index_name :
426
+ with _using_temp_index (status_cb ) if not incremental else nullcontext ( STUDIO_INDEX_NAME ) as index_name :
309
427
############## Configure the index ##############
310
428
311
- # The following index settings are best changed on an empty index.
312
- # Changing them on a populated index will "re-index all documents in the index, which can take some time"
429
+ # The index settings are best changed on an empty index.
430
+ # Changing them on a populated index will "re-index all documents in the index" , which can take some time
313
431
# and use more RAM. Instead, we configure an empty index then populate it one course/library at a time.
314
-
315
- # Mark usage_key as unique (it's not the primary key for the index, but nevertheless must be unique):
316
- client .index (temp_index_name ).update_distinct_attribute (Fields .usage_key )
317
- # Mark which attributes can be used for filtering/faceted search:
318
- client .index (temp_index_name ).update_filterable_attributes ([
319
- # Get specific block/collection using combination of block_id and context_key
320
- Fields .block_id ,
321
- Fields .block_type ,
322
- Fields .context_key ,
323
- Fields .usage_key ,
324
- Fields .org ,
325
- Fields .tags ,
326
- Fields .tags + "." + Fields .tags_taxonomy ,
327
- Fields .tags + "." + Fields .tags_level0 ,
328
- Fields .tags + "." + Fields .tags_level1 ,
329
- Fields .tags + "." + Fields .tags_level2 ,
330
- Fields .tags + "." + Fields .tags_level3 ,
331
- Fields .collections ,
332
- Fields .collections + "." + Fields .collections_display_name ,
333
- Fields .collections + "." + Fields .collections_key ,
334
- Fields .type ,
335
- Fields .access_id ,
336
- Fields .last_published ,
337
- Fields .content + "." + Fields .problem_types ,
338
- ])
339
- # Mark which attributes are used for keyword search, in order of importance:
340
- client .index (temp_index_name ).update_searchable_attributes ([
341
- # Keyword search does _not_ search the course name, course ID, breadcrumbs, block type, or other fields.
342
- Fields .display_name ,
343
- Fields .block_id ,
344
- Fields .content ,
345
- Fields .description ,
346
- Fields .tags ,
347
- Fields .collections ,
348
- # If we don't list the following sub-fields _explicitly_, they're only sometimes searchable - that is, they
349
- # are searchable only if at least one document in the index has a value. If we didn't list them here and,
350
- # say, there were no tags.level3 tags in the index, the client would get an error if trying to search for
351
- # these sub-fields: "Attribute `tags.level3` is not searchable."
352
- Fields .tags + "." + Fields .tags_taxonomy ,
353
- Fields .tags + "." + Fields .tags_level0 ,
354
- Fields .tags + "." + Fields .tags_level1 ,
355
- Fields .tags + "." + Fields .tags_level2 ,
356
- Fields .tags + "." + Fields .tags_level3 ,
357
- Fields .collections + "." + Fields .collections_display_name ,
358
- Fields .collections + "." + Fields .collections_key ,
359
- Fields .published + "." + Fields .display_name ,
360
- Fields .published + "." + Fields .published_description ,
361
- ])
362
- # Mark which attributes can be used for sorting search results:
363
- client .index (temp_index_name ).update_sortable_attributes ([
364
- Fields .display_name ,
365
- Fields .created ,
366
- Fields .modified ,
367
- Fields .last_published ,
368
- ])
369
-
370
- # Update the search ranking rules to let the (optional) "sort" parameter take precedence over keyword relevance.
371
- # cf https://www.meilisearch.com/docs/learn/core_concepts/relevancy
372
- client .index (temp_index_name ).update_ranking_rules ([
373
- "sort" ,
374
- "words" ,
375
- "typo" ,
376
- "proximity" ,
377
- "attribute" ,
378
- "exactness" ,
379
- ])
432
+ if not incremental :
433
+ _configure_index (index_name )
380
434
381
435
############## Libraries ##############
382
436
status_cb ("Indexing libraries..." )
383
437
384
- def index_library (lib_key : str ) -> list :
438
+ def index_library (lib_key : LibraryLocatorV2 ) -> list :
385
439
docs = []
386
440
for component in lib_api .get_library_components (lib_key ):
387
441
try :
@@ -396,7 +450,7 @@ def index_library(lib_key: str) -> list:
396
450
if docs :
397
451
try :
398
452
# Add all the docs in this library at once (usually faster than adding one at a time):
399
- _wait_for_meili_task (client .index (temp_index_name ).add_documents (docs ))
453
+ _wait_for_meili_task (client .index (index_name ).add_documents (docs ))
400
454
except (TypeError , KeyError , MeilisearchError ) as err :
401
455
status_cb (f"Error indexing library { lib_key } : { err } " )
402
456
return docs
@@ -416,7 +470,7 @@ def index_collection_batch(batch, num_done, library_key) -> int:
416
470
if docs :
417
471
try :
418
472
# Add docs in batch of 100 at once (usually faster than adding one at a time):
419
- _wait_for_meili_task (client .index (temp_index_name ).add_documents (docs ))
473
+ _wait_for_meili_task (client .index (index_name ).add_documents (docs ))
420
474
except (TypeError , KeyError , MeilisearchError ) as err :
421
475
status_cb (f"Error indexing collection batch { p } : { err } " )
422
476
return num_done
@@ -439,6 +493,8 @@ def index_collection_batch(batch, num_done, library_key) -> int:
439
493
num_collections_done ,
440
494
lib_key ,
441
495
)
496
+ if incremental :
497
+ IncrementalIndexCompleted .objects .get_or_create (context_key = lib_key )
442
498
status_cb (f"{ num_collections_done } /{ num_collections } collections indexed for library { lib_key } " )
443
499
444
500
num_contexts_done += 1
@@ -464,7 +520,7 @@ def add_with_children(block):
464
520
465
521
if docs :
466
522
# Add all the docs in this course at once (usually faster than adding one at a time):
467
- _wait_for_meili_task (client .index (temp_index_name ).add_documents (docs ))
523
+ _wait_for_meili_task (client .index (index_name ).add_documents (docs ))
468
524
return docs
469
525
470
526
paginator = Paginator (CourseOverview .objects .only ('id' , 'display_name' ), 1000 )
@@ -473,10 +529,16 @@ def add_with_children(block):
473
529
status_cb (
474
530
f"{ num_contexts_done + 1 } /{ num_contexts } . Now indexing course { course .display_name } ({ course .id } )"
475
531
)
532
+ if course .id in keys_indexed :
533
+ num_contexts_done += 1
534
+ continue
476
535
course_docs = index_course (course )
536
+ if incremental :
537
+ IncrementalIndexCompleted .objects .get_or_create (context_key = course .id )
477
538
num_contexts_done += 1
478
539
num_blocks_done += len (course_docs )
479
540
541
+ IncrementalIndexCompleted .objects .all ().delete ()
480
542
status_cb (f"Done! { num_blocks_done } blocks indexed across { num_contexts_done } courses, collections and libraries." )
481
543
482
544
0 commit comments