Merge pull request #172 from openedx/saleem-latif/ENT-7294-updates

[ENT-7294] Performance enhancements in job recommendations calculation.
openedx · Jul 31, 2023 · 25e881e · 25e881e
2 parents 8a6788f + e707dc7
commit 25e881e
Show file tree

Hide file tree

Showing 4 changed files with 117 additions and 44 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -13,6 +13,10 @@ Change Log
 
 Unreleased
 
+[1.43.1] - 2023-07-31
+---------------------
+* perf: Performance enhancements in job recommendations calculation.
+
 [1.43.0] - 2023-07-07
 ---------------------
 * feat: reuse tags from similar product for xblock skills.

diff --git a/taxonomy/__init__.py b/taxonomy/__init__.py
@@ -15,6 +15,6 @@
 # 2. MINOR version when you add functionality in a backwards compatible manner, and
 # 3. PATCH version when you make backwards compatible bug fixes.
 # More details can be found at https://semver.org/
-__version__ = '1.42.3'
+__version__ = '1.43.1'
 
 default_app_config = 'taxonomy.apps.TaxonomyConfig'  # pylint: disable=invalid-name
diff --git a/taxonomy/algolia/utils.py b/taxonomy/algolia/utils.py
@@ -4,7 +4,7 @@
 """
 import logging
 import datetime
-import pandas as pd
+from collections import deque, namedtuple
 
 from django.conf import settings
 from django.db.models import Sum
@@ -16,6 +16,8 @@
 
 LOGGER = logging.getLogger(__name__)
 
+JobRecommendation = namedtuple('JobRecommendation', 'name similarity')
+
 
 def index_jobs_data_in_algolia():
     """
@@ -78,6 +80,36 @@ def combine_jobs_and_skills_data(jobs_qs):
     return all_job_and_skills_data
 
 
+def insert_item_in_ordered_queue(queue, item, key=lambda arg: arg):
+    """
+    Insert the given item into the ordered queue.
+
+    `queue` is assumed to be ordered by the given key in descending order.
+    `item` is the item to insert into the queue; it will be inserted at the correct position.
+
+    Note: item will not be inserted if there is no place for it based on the key.
+
+    Arguments:
+        queue (deque<Any>): A Queue containing list of items.
+        item (Any): Item that needs to be inserted.
+        key (func): Optional key to get the comparable attribute of the item.
+    """
+    for index, element in enumerate(queue):
+        if key(item) > key(element):
+            if len(queue) == queue.maxlen:
+                # remove the last element of the queue to avoid index error.
+                queue.pop()
+            queue.insert(index, item)
+
+            # Item is inserted, return here.
+            return
+
+    # If item could not be inserted, then check for available space, and insert the item if there is space.
+    if len(queue) != queue.maxlen:
+        queue.append(item)
+        return
+
+
 def calculate_job_recommendations(jobs):
     """
     Calculate job recommendations.
@@ -88,57 +120,33 @@ def calculate_job_recommendations(jobs):
     Returns:
         (list<dict>): A list of dicts containing jobs and their recommended jobs.
     """
-    candidate_jobs = []
-    matching_jobs = []
-    jaccard_similarities = []
+    SIMILAR_JOBS_COUNT = 3
+    job_recommendations = deque([], maxlen=SIMILAR_JOBS_COUNT)
+    jobs_and_recommendations = []
 
-    for job in jobs:
-        job_skills_set = set(job['skills'])
+    # converting skills list into set, to avoid repeated converting in the nested loop.
+    jobs = [
+        {'name': job['name'], 'skills': set(job['skills'])} for job in jobs
+    ]
 
+    for job in jobs:
         for candidate_job in jobs:
             if job['name'] == candidate_job['name']:
                 continue
 
-            other_job_skills_set = set(candidate_job['skills'])
-            jaccard_similarity = calculate_jaccard_similarity(job_skills_set, other_job_skills_set)
-            candidate_jobs.append(job['name'])
-            matching_jobs.append(candidate_job['name'])
-            jaccard_similarities.append(jaccard_similarity)
-
-    dtype_dict = {
-        'job': "category",
-        'matching_job': "category",
-        'jaccard_similarity': "float16",
-    }
-
-    LOGGER.info('[TAXONOMY] [DEBUG] Calculating similar jobs for the active jobs.')
-    similar_jobs = pd.DataFrame(
-        {
-            'job': candidate_jobs,
-            'matching_job': matching_jobs,
-            'jaccard_similarity': jaccard_similarities,
-        },
-    ).astype(dtype_dict)
-
-    LOGGER.info('[TAXONOMY] [DEBUG] similar jobs data frame created.')
-    similar_jobs['rank'] = similar_jobs.groupby('job')['jaccard_similarity'].rank(method='first', ascending=False)
-
-    # This line is necessary as it makes sure save the copy of the data frame generated by
-    # `similar_jobs[similar_jobs['rank'] <= 3]` in a new variable.
-    similar_jobs = similar_jobs[similar_jobs['rank'] <= 3]
-    similar_jobs.sort_values(
-        by=['job', 'rank'],
-        ascending=[True, True],
-        inplace=True,
-    )
+            jaccard_similarity = calculate_jaccard_similarity(job['skills'], candidate_job['skills'])
+
+            insert_item_in_ordered_queue(
+                queue=job_recommendations,
+                item=JobRecommendation(job['name'], jaccard_similarity),
+                key=lambda item: item.similarity,
+            )
 
-    LOGGER.info('[TAXONOMY] [DEBUG] similar jobs calculation complete.')
-    jobs_and_recommendations = []
-    for job in jobs:
         jobs_and_recommendations.append({
             'name': job['name'],
-            'similar_jobs': similar_jobs[similar_jobs['job'] == job['name']]['matching_job'].tolist(),
+            'similar_jobs': [item.name for item in job_recommendations],
         })
+
     return jobs_and_recommendations
 
 

diff --git a/tests/algolia/test_utils.py b/tests/algolia/test_utils.py
@@ -1,11 +1,19 @@
 """
 Tests for algolia utility functions.
 """
+from collections import deque
+
 import mock
 from pytest import mark
 
 from taxonomy.algolia.constants import ALGOLIA_JOBS_INDEX_SETTINGS
-from taxonomy.algolia.utils import calculate_jaccard_similarity, fetch_jobs_data, index_jobs_data_in_algolia
+from taxonomy.algolia.utils import (
+    JobRecommendation,
+    calculate_jaccard_similarity,
+    fetch_jobs_data,
+    index_jobs_data_in_algolia,
+    insert_item_in_ordered_queue,
+)
 from test_utils import factories
 from test_utils.testcase import TaxonomyTestCase
 
@@ -211,3 +219,56 @@ def test_index_jobs_data_in_algolia(self, algolia_search_client_mock):
 
         set_settings_mock.assert_called_once_with(ALGOLIA_JOBS_INDEX_SETTINGS)
         replace_all_objects_mock.assert_called_once_with(mock.ANY, mock.ANY)
+
+    def test_insert_item_in_ordered_queue(self):
+        """
+        Test insert_item_in_ordered_queue works as expected.
+        """
+        queue = deque([], maxlen=3)
+
+        # Insert first 3 jobs and make sure they get inserted in the right order.
+        insert_item_in_ordered_queue(queue, JobRecommendation('job-2', 2), key=lambda job: job.similarity)
+        insert_item_in_ordered_queue(queue, JobRecommendation('job-1', 1), key=lambda job: job.similarity)
+        insert_item_in_ordered_queue(queue, JobRecommendation('job-3', 3), key=lambda job: job.similarity)
+
+        assert queue[0].name == 'job-3'
+        assert queue[1].name == 'job-2'
+        assert queue[2].name == 'job-1'
+
+        # Now, insert an element in the start and make sure it is inserted in the correct spot.
+        insert_item_in_ordered_queue(queue, JobRecommendation('job-4', 4), key=lambda job: job.similarity)
+        assert len(queue) == 3
+        assert queue[0].name == 'job-4'
+        assert queue[1].name == 'job-3'
+        assert queue[2].name == 'job-2'
+
+        # Now, insert an element in the middle and make sure it is inserted in the correct spot.
+        insert_item_in_ordered_queue(queue, JobRecommendation('job-3.5', 3.5), key=lambda job: job.similarity)
+        assert len(queue) == 3
+        assert queue[0].name == 'job-4'
+        assert queue[1].name == 'job-3.5'
+        assert queue[2].name == 'job-3'
+
+        # Now, insert an element in the end and make sure it is inserted in the correct spot.
+        insert_item_in_ordered_queue(queue, JobRecommendation('job-3.1', 3.1), key=lambda job: job.similarity)
+        assert len(queue) == 3
+        assert queue[0].name == 'job-4'
+        assert queue[1].name == 'job-3.5'
+        assert queue[2].name == 'job-3.1'
+
+        # Now, try to insert an element that should not be added and make sure it is not inserted.
+        insert_item_in_ordered_queue(queue, JobRecommendation('job-2', 2), key=lambda job: job.similarity)
+        assert len(queue) == 3
+        assert queue[0].name == 'job-4'
+        assert queue[1].name == 'job-3.5'
+        assert queue[2].name == 'job-3.1'
+
+        # test with default key
+        queue = deque([], maxlen=3)
+        insert_item_in_ordered_queue(queue, 1)
+        insert_item_in_ordered_queue(queue, 3)
+        insert_item_in_ordered_queue(queue, 2)
+
+        assert queue[0] == 3
+        assert queue[1] == 2
+        assert queue[2] == 1