
Commit b07464b

feat: incremental reindex_studio management command (#35864)
This allows large instances to run an (interruptible, resumable) reindex task that can cover thousands of courses.
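For reference, here is a minimal sketch of how the resumable rebuild might be driven programmatically. It uses Django's standard `call_command` API; the option names mirror the `--experimental [--incremental]` flags mentioned in the warning messages in the diff below, but the management command itself is not part of this excerpt, so treat the exact signature as an assumption.

```python
# Hedged sketch: drive the new incremental reindex from Python.
# CLI equivalent (per the warning strings in api.py below):
#   ./manage.py cms reindex_studio --experimental --incremental
from django.core.management import call_command


def run_incremental_reindex() -> None:
    # If this run is interrupted, running it again resumes where it left off:
    # completed courses/libraries are recorded in IncrementalIndexCompleted
    # and skipped on the next pass.
    call_command("reindex_studio", experimental=True, incremental=True)
```

The bookkeeping rows are deleted at the end of a successful full pass (see `IncrementalIndexCompleted.objects.all().delete()` in the diff), so a later rebuild starts fresh.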
1 parent 3196ceb commit b07464b

6 files changed: +342 -84 lines changed


openedx/core/djangoapps/content/search/api.py

+142-80
@@ -5,7 +5,7 @@
 
 import logging
 import time
-from contextlib import contextmanager
+from contextlib import contextmanager, nullcontext
 from datetime import datetime, timedelta, timezone
 from functools import wraps
 from typing import Callable, Generator
@@ -24,7 +24,14 @@
 from rest_framework.request import Request
 from common.djangoapps.student.role_helpers import get_course_roles
 from openedx.core.djangoapps.content.course_overviews.models import CourseOverview
-from openedx.core.djangoapps.content.search.models import get_access_ids_for_request
+from openedx.core.djangoapps.content.search.models import get_access_ids_for_request, IncrementalIndexCompleted
+from openedx.core.djangoapps.content.search.index_config import (
+    INDEX_DISTINCT_ATTRIBUTE,
+    INDEX_FILTERABLE_ATTRIBUTES,
+    INDEX_SEARCHABLE_ATTRIBUTES,
+    INDEX_SORTABLE_ATTRIBUTES,
+    INDEX_RANKING_RULES,
+)
 from openedx.core.djangoapps.content_libraries import api as lib_api
 from xmodule.modulestore.django import modulestore

@@ -217,6 +224,42 @@ def _using_temp_index(status_cb: Callable[[str], None] | None = None) -> Generat
         _wait_for_meili_task(client.delete_index(temp_index_name))
 
 
+def _index_is_empty(index_name: str) -> bool:
+    """
+    Check if an index is empty
+
+    Args:
+        index_name (str): The name of the index to check
+    """
+    client = _get_meilisearch_client()
+    index = client.get_index(index_name)
+    return index.get_stats().number_of_documents == 0
+
+
+def _configure_index(index_name):
+    """
+    Configure the index. The following index settings are best changed on an empty index.
+    Changing them on a populated index will "re-index all documents in the index", which can take some time.
+
+    Args:
+        index_name (str): The name of the index to configure
+    """
+    client = _get_meilisearch_client()
+
+    # Mark usage_key as unique (it's not the primary key for the index, but nevertheless must be unique):
+    client.index(index_name).update_distinct_attribute(INDEX_DISTINCT_ATTRIBUTE)
+    # Mark which attributes can be used for filtering/faceted search:
+    client.index(index_name).update_filterable_attributes(INDEX_FILTERABLE_ATTRIBUTES)
+    # Mark which attributes are used for keyword search, in order of importance:
+    client.index(index_name).update_searchable_attributes(INDEX_SEARCHABLE_ATTRIBUTES)
+    # Mark which attributes can be used for sorting search results:
+    client.index(index_name).update_sortable_attributes(INDEX_SORTABLE_ATTRIBUTES)
+
+    # Update the search ranking rules to let the (optional) "sort" parameter take precedence over keyword relevance.
+    # cf https://www.meilisearch.com/docs/learn/core_concepts/relevancy
+    client.index(index_name).update_ranking_rules(INDEX_RANKING_RULES)
+
+
 def _recurse_children(block, fn, status_cb: Callable[[str], None] | None = None) -> None:
     """
     Recurse the children of an XBlock and call the given function for each
@@ -279,8 +322,75 @@ def is_meilisearch_enabled() -> bool:
     return False
 
 
-# pylint: disable=too-many-statements
-def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:
+def reset_index(status_cb: Callable[[str], None] | None = None) -> None:
+    """
+    Reset the Meilisearch index, deleting all documents and reconfiguring it
+    """
+    if status_cb is None:
+        status_cb = log.info
+
+    status_cb("Creating new empty index...")
+    with _using_temp_index(status_cb) as temp_index_name:
+        _configure_index(temp_index_name)
+        status_cb("Index recreated!")
+    status_cb("Index reset complete.")
+
+
+def _is_index_configured(index_name: str) -> bool:
+    """
+    Check if an index is completely configured
+
+    Args:
+        index_name (str): The name of the index to check
+    """
+    client = _get_meilisearch_client()
+    index = client.get_index(index_name)
+    index_settings = index.get_settings()
+    for k, v in (
+        ("distinctAttribute", INDEX_DISTINCT_ATTRIBUTE),
+        ("filterableAttributes", INDEX_FILTERABLE_ATTRIBUTES),
+        ("searchableAttributes", INDEX_SEARCHABLE_ATTRIBUTES),
+        ("sortableAttributes", INDEX_SORTABLE_ATTRIBUTES),
+        ("rankingRules", INDEX_RANKING_RULES),
+    ):
+        setting = index_settings.get(k, [])
+        if isinstance(v, list):
+            v = set(v)
+            setting = set(setting)
+        if setting != v:
+            return False
+    return True
+
+
+def init_index(status_cb: Callable[[str], None] | None = None, warn_cb: Callable[[str], None] | None = None) -> None:
+    """
+    Initialize the Meilisearch index, creating it and configuring it if it doesn't exist
+    """
+    if status_cb is None:
+        status_cb = log.info
+    if warn_cb is None:
+        warn_cb = log.warning
+
+    if _index_exists(STUDIO_INDEX_NAME):
+        if _index_is_empty(STUDIO_INDEX_NAME):
+            warn_cb(
+                "The studio search index is empty. Please run ./manage.py cms reindex_studio"
+                " --experimental [--incremental]"
+            )
+            return
+        if not _is_index_configured(STUDIO_INDEX_NAME):
+            warn_cb(
+                "A rebuild of the index is required. Please run ./manage.py cms reindex_studio"
+                " --experimental [--incremental]"
+            )
+            return
+        status_cb("Index already exists and is configured.")
+        return
+
+    reset_index(status_cb)
+
+
+def rebuild_index(status_cb: Callable[[str], None] | None = None, incremental=False) -> None:  # lint-amnesty, pylint: disable=too-many-statements
     """
     Rebuild the Meilisearch index from scratch
     """
@@ -292,96 +402,40 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:
 
     # Get the lists of libraries
     status_cb("Counting libraries...")
-    lib_keys = [lib.library_key for lib in lib_api.ContentLibrary.objects.select_related('org').only('org', 'slug')]
+    keys_indexed = []
+    if incremental:
+        keys_indexed = list(IncrementalIndexCompleted.objects.values_list("context_key", flat=True))
+    lib_keys = [
+        lib.library_key
+        for lib in lib_api.ContentLibrary.objects.select_related("org").only("org", "slug").order_by("-id")
+        if lib.library_key not in keys_indexed
+    ]
     num_libraries = len(lib_keys)
 
     # Get the list of courses
    status_cb("Counting courses...")
     num_courses = CourseOverview.objects.count()
 
     # Some counters so we can track our progress as indexing progresses:
-    num_contexts = num_courses + num_libraries
-    num_contexts_done = 0  # How many courses/libraries we've indexed
+    num_libs_skipped = len(keys_indexed)
+    num_contexts = num_courses + num_libraries + num_libs_skipped
+    num_contexts_done = 0 + num_libs_skipped  # How many courses/libraries we've indexed
     num_blocks_done = 0  # How many individual components/XBlocks we've indexed
 
     status_cb(f"Found {num_courses} courses, {num_libraries} libraries.")
-    with _using_temp_index(status_cb) as temp_index_name:
+    with _using_temp_index(status_cb) if not incremental else nullcontext(STUDIO_INDEX_NAME) as index_name:
         ############## Configure the index ##############
 
-        # The following index settings are best changed on an empty index.
-        # Changing them on a populated index will "re-index all documents in the index, which can take some time"
+        # The index settings are best changed on an empty index.
+        # Changing them on a populated index will "re-index all documents in the index", which can take some time
         # and use more RAM. Instead, we configure an empty index then populate it one course/library at a time.
-
-        # Mark usage_key as unique (it's not the primary key for the index, but nevertheless must be unique):
-        client.index(temp_index_name).update_distinct_attribute(Fields.usage_key)
-        # Mark which attributes can be used for filtering/faceted search:
-        client.index(temp_index_name).update_filterable_attributes([
-            # Get specific block/collection using combination of block_id and context_key
-            Fields.block_id,
-            Fields.block_type,
-            Fields.context_key,
-            Fields.usage_key,
-            Fields.org,
-            Fields.tags,
-            Fields.tags + "." + Fields.tags_taxonomy,
-            Fields.tags + "." + Fields.tags_level0,
-            Fields.tags + "." + Fields.tags_level1,
-            Fields.tags + "." + Fields.tags_level2,
-            Fields.tags + "." + Fields.tags_level3,
-            Fields.collections,
-            Fields.collections + "." + Fields.collections_display_name,
-            Fields.collections + "." + Fields.collections_key,
-            Fields.type,
-            Fields.access_id,
-            Fields.last_published,
-            Fields.content + "." + Fields.problem_types,
-        ])
-        # Mark which attributes are used for keyword search, in order of importance:
-        client.index(temp_index_name).update_searchable_attributes([
-            # Keyword search does _not_ search the course name, course ID, breadcrumbs, block type, or other fields.
-            Fields.display_name,
-            Fields.block_id,
-            Fields.content,
-            Fields.description,
-            Fields.tags,
-            Fields.collections,
-            # If we don't list the following sub-fields _explicitly_, they're only sometimes searchable - that is, they
-            # are searchable only if at least one document in the index has a value. If we didn't list them here and,
-            # say, there were no tags.level3 tags in the index, the client would get an error if trying to search for
-            # these sub-fields: "Attribute `tags.level3` is not searchable."
-            Fields.tags + "." + Fields.tags_taxonomy,
-            Fields.tags + "." + Fields.tags_level0,
-            Fields.tags + "." + Fields.tags_level1,
-            Fields.tags + "." + Fields.tags_level2,
-            Fields.tags + "." + Fields.tags_level3,
-            Fields.collections + "." + Fields.collections_display_name,
-            Fields.collections + "." + Fields.collections_key,
-            Fields.published + "." + Fields.display_name,
-            Fields.published + "." + Fields.published_description,
-        ])
-        # Mark which attributes can be used for sorting search results:
-        client.index(temp_index_name).update_sortable_attributes([
-            Fields.display_name,
-            Fields.created,
-            Fields.modified,
-            Fields.last_published,
-        ])
-
-        # Update the search ranking rules to let the (optional) "sort" parameter take precedence over keyword relevance.
-        # cf https://www.meilisearch.com/docs/learn/core_concepts/relevancy
-        client.index(temp_index_name).update_ranking_rules([
-            "sort",
-            "words",
-            "typo",
-            "proximity",
-            "attribute",
-            "exactness",
-        ])
+        if not incremental:
+            _configure_index(index_name)
 
         ############## Libraries ##############
         status_cb("Indexing libraries...")
 
-        def index_library(lib_key: str) -> list:
+        def index_library(lib_key: LibraryLocatorV2) -> list:
             docs = []
             for component in lib_api.get_library_components(lib_key):
                 try:
@@ -396,7 +450,7 @@ def index_library(lib_key: str) -> list:
             if docs:
                 try:
                     # Add all the docs in this library at once (usually faster than adding one at a time):
-                    _wait_for_meili_task(client.index(temp_index_name).add_documents(docs))
+                    _wait_for_meili_task(client.index(index_name).add_documents(docs))
                 except (TypeError, KeyError, MeilisearchError) as err:
                     status_cb(f"Error indexing library {lib_key}: {err}")
             return docs
@@ -416,7 +470,7 @@ def index_collection_batch(batch, num_done, library_key) -> int:
             if docs:
                 try:
                     # Add docs in batch of 100 at once (usually faster than adding one at a time):
-                    _wait_for_meili_task(client.index(temp_index_name).add_documents(docs))
+                    _wait_for_meili_task(client.index(index_name).add_documents(docs))
                 except (TypeError, KeyError, MeilisearchError) as err:
                     status_cb(f"Error indexing collection batch {p}: {err}")
             return num_done
@@ -439,6 +493,8 @@ def index_collection_batch(batch, num_done, library_key) -> int:
                     num_collections_done,
                     lib_key,
                 )
+            if incremental:
+                IncrementalIndexCompleted.objects.get_or_create(context_key=lib_key)
             status_cb(f"{num_collections_done}/{num_collections} collections indexed for library {lib_key}")
 
             num_contexts_done += 1
@@ -464,7 +520,7 @@ def add_with_children(block):
 
             if docs:
                 # Add all the docs in this course at once (usually faster than adding one at a time):
-                _wait_for_meili_task(client.index(temp_index_name).add_documents(docs))
+                _wait_for_meili_task(client.index(index_name).add_documents(docs))
             return docs
 
         paginator = Paginator(CourseOverview.objects.only('id', 'display_name'), 1000)
@@ -473,10 +529,16 @@ def add_with_children(block):
                 status_cb(
                     f"{num_contexts_done + 1}/{num_contexts}. Now indexing course {course.display_name} ({course.id})"
                 )
+                if course.id in keys_indexed:
+                    num_contexts_done += 1
+                    continue
                 course_docs = index_course(course)
+                if incremental:
+                    IncrementalIndexCompleted.objects.get_or_create(context_key=course.id)
                 num_contexts_done += 1
                 num_blocks_done += len(course_docs)
 
+    IncrementalIndexCompleted.objects.all().delete()
     status_cb(f"Done! {num_blocks_done} blocks indexed across {num_contexts_done} courses, collections and libraries.")

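The diff above relies on the new `IncrementalIndexCompleted` model imported from `openedx.core.djangoapps.content.search.models`; its definition is in one of the six changed files but is not shown in this excerpt. Below is a minimal sketch of the shape implied by the calls used above (`values_list("context_key", flat=True)`, `get_or_create(context_key=...)`, `.all().delete()`); the field type and the uniqueness constraint are assumptions.

```python
# Hypothetical sketch only -- the real model lives in
# openedx/core/djangoapps/content/search/models.py (not shown in this excerpt).
from django.db import models
from opaque_keys.edx.django.models import LearningContextKeyField


class IncrementalIndexCompleted(models.Model):
    """Records the courses/libraries already indexed during an incremental (resumable) rebuild."""

    # One row per completed course or library; rebuild_index() skips these keys
    # on resume and deletes all rows once the whole pass finishes.
    context_key = LearningContextKeyField(max_length=255, unique=True)
```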
openedx/core/djangoapps/content/search/index_config.py

+70-0
@@ -0,0 +1,70 @@
+"""Configuration for the search index."""
+from .documents import Fields
+
+
+INDEX_DISTINCT_ATTRIBUTE = "usage_key"
+
+# Mark which attributes can be used for filtering/faceted search:
+INDEX_FILTERABLE_ATTRIBUTES = [
+    # Get specific block/collection using combination of block_id and context_key
+    Fields.block_id,
+    Fields.block_type,
+    Fields.context_key,
+    Fields.usage_key,
+    Fields.org,
+    Fields.tags,
+    Fields.tags + "." + Fields.tags_taxonomy,
+    Fields.tags + "." + Fields.tags_level0,
+    Fields.tags + "." + Fields.tags_level1,
+    Fields.tags + "." + Fields.tags_level2,
+    Fields.tags + "." + Fields.tags_level3,
+    Fields.collections,
+    Fields.collections + "." + Fields.collections_display_name,
+    Fields.collections + "." + Fields.collections_key,
+    Fields.type,
+    Fields.access_id,
+    Fields.last_published,
+    Fields.content + "." + Fields.problem_types,
+]
+
+# Mark which attributes are used for keyword search, in order of importance:
+INDEX_SEARCHABLE_ATTRIBUTES = [
+    # Keyword search does _not_ search the course name, course ID, breadcrumbs, block type, or other fields.
+    Fields.display_name,
+    Fields.block_id,
+    Fields.content,
+    Fields.description,
+    Fields.tags,
+    Fields.collections,
+    # If we don't list the following sub-fields _explicitly_, they're only sometimes searchable - that is, they
+    # are searchable only if at least one document in the index has a value. If we didn't list them here and,
+    # say, there were no tags.level3 tags in the index, the client would get an error if trying to search for
+    # these sub-fields: "Attribute `tags.level3` is not searchable."
+    Fields.tags + "." + Fields.tags_taxonomy,
+    Fields.tags + "." + Fields.tags_level0,
+    Fields.tags + "." + Fields.tags_level1,
+    Fields.tags + "." + Fields.tags_level2,
+    Fields.tags + "." + Fields.tags_level3,
+    Fields.collections + "." + Fields.collections_display_name,
+    Fields.collections + "." + Fields.collections_key,
+    Fields.published + "." + Fields.display_name,
+    Fields.published + "." + Fields.published_description,
+]
+
+# Mark which attributes can be used for sorting search results:
+INDEX_SORTABLE_ATTRIBUTES = [
+    Fields.display_name,
+    Fields.created,
+    Fields.modified,
+    Fields.last_published,
+]
+
+# Update the search ranking rules to let the (optional) "sort" parameter take precedence over keyword relevance.
+INDEX_RANKING_RULES = [
+    "sort",
+    "words",
+    "typo",
+    "proximity",
+    "attribute",
+    "exactness",
+]

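A side note on `_is_index_configured` in api.py above: the constants defined in this file are compared against the live index settings, and list-valued settings are compared as sets, so the attribute order returned by Meilisearch cannot by itself trigger a needless rebuild warning. Below is a small standalone illustration of that comparison; the sample settings payload is invented for the example and is not a real Meilisearch response.

```python
# Standalone illustration of the set-based comparison used by _is_index_configured.
# The "live_settings" dict below is made up for the example.
EXPECTED = {
    "distinctAttribute": "usage_key",
    "sortableAttributes": ["display_name", "created", "modified", "last_published"],
}

live_settings = {
    "distinctAttribute": "usage_key",
    # Same attributes, different order -- still counts as configured:
    "sortableAttributes": ["created", "display_name", "last_published", "modified"],
}


def is_configured(settings: dict) -> bool:
    for key, expected in EXPECTED.items():
        actual = settings.get(key, [])
        if isinstance(expected, list):
            expected, actual = set(expected), set(actual)
        if actual != expected:
            return False
    return True


assert is_configured(live_settings)
```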