Merge pull request #170 from openedx/saleem-latif/ENT-7294

saleem-latif · web-flow · commit db90fead8350 · 2023-07-25T15:14:02.000+05:00
ENT-7294: memory optimization for job recommendations.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -13,6 +13,10 @@ Change Log
 
 Unreleased
 
+[1.42.3] - 2023-07-14
+---------------------
+* perf: memory optimisation for job recommendations.
+
 [1.42.2] - 2023-07-14
 ---------------------
 * perf: pandas dataframe loading memory optimisation
diff --git a/taxonomy/__init__.py b/taxonomy/__init__.py
@@ -15,6 +15,6 @@
 # 2. MINOR version when you add functionality in a backwards compatible manner, and
 # 3. PATCH version when you make backwards compatible bug fixes.
 # More details can be found at https://semver.org/
-__version__ = '1.42.2'
+__version__ = '1.42.3'
 
 default_app_config = 'taxonomy.apps.TaxonomyConfig'  # pylint: disable=invalid-name
diff --git a/taxonomy/algolia/utils.py b/taxonomy/algolia/utils.py
@@ -50,41 +50,26 @@ def calculate_jaccard_similarity(set_a, set_b):
     Calculate Jaccard similarity between two sets of job skills.
     """
     try:
-        intersection = set_a.intersection(set_b)
-        jaccard_similarity = len(intersection) / len(set_a.union(set_b))
-        return jaccard_similarity
+        return len(set_a.intersection(set_b)) / len(set_a.union(set_b))
     except ZeroDivisionError:
         return float(0)
 
 
-def fetch_job_skills(job, all_job_skills):
-    """
-    Construct a list of all the job skills from the database.
-
-    Returns:
-        (list<dict>): A list of dicts containing job skills data.
-    """
-    job_skills = all_job_skills.filter(job=job)
-    skills = []
-    for job_skill in job_skills:
-        skills.append(job_skill.skill.name)
-    return skills
-
-
-def combine_jobs_and_skills_data(jobs):
+def combine_jobs_and_skills_data(jobs_qs):
     """
     Combine jobs and skills data.
 
+    Arguments:
+        jobs_qs (QuerySet): Django queryset of Job model that will be used as a starting point to fetch skills data.
+
     Returns:
         (list<dict>): A list of dicts containing job and their skills in a list.
     """
-    jobs = jobs.all()
-    all_job_skills = JobSkills.objects.all()
-
     all_job_and_skills_data = []
-    for job in jobs:
-        all_job_skills = JobSkills.objects.filter(job=job)
-        skills = fetch_job_skills(job, all_job_skills)
+    for job in jobs_qs.all():
+        skills = list(
+            JobSkills.objects.filter(job=job).values_list('skill__name', flat=True)
+        )
         all_job_and_skills_data.append({
             'name': job.name,
             'skills': skills,
@@ -98,7 +83,7 @@ def calculate_job_recommendations(jobs):
     Calculate job recommendations.
 
     Args:
-        job (list<dict>): AA list of dicts containing job and their skills in a list.
+        jobs (list<dict>): A list of dicts, each containing a job name and a list of its skills.
 
     Returns:
         (list<dict>): A list of dicts containing jobs and their recommended jobs.
@@ -126,6 +111,7 @@ def calculate_job_recommendations(jobs):
         'jaccard_similarity': "float16",
     }
 
+    LOGGER.info('[TAXONOMY] [DEBUG] Calculating similar jobs for the active jobs.')
     similar_jobs = pd.DataFrame(
         {
             'job': candidate_jobs,
@@ -134,10 +120,19 @@ def calculate_job_recommendations(jobs):
         },
     ).astype(dtype_dict)
 
+    LOGGER.info('[TAXONOMY] [DEBUG] similar jobs data frame created.')
     similar_jobs['rank'] = similar_jobs.groupby('job')['jaccard_similarity'].rank(method='first', ascending=False)
-    mask = (similar_jobs['rank'] <= 3)
-    similar_jobs = similar_jobs[mask].sort_values(by=['job', 'rank'], ascending=[True, True])
 
+    # This line is necessary as it makes sure to save the copy of the data frame generated by
+    # `similar_jobs[similar_jobs['rank'] <= 3]` in a new variable.
+    similar_jobs = similar_jobs[similar_jobs['rank'] <= 3]
+    similar_jobs.sort_values(
+        by=['job', 'rank'],
+        ascending=[True, True],
+        inplace=True,
+    )
+
+    LOGGER.info('[TAXONOMY] [DEBUG] similar jobs calculation complete.')
     jobs_and_recommendations = []
     for job in jobs:
         jobs_and_recommendations.append({
@@ -151,14 +146,19 @@ def combine_industry_skills():
     """
     Constructs a dict with keys as industry names and values as their skills.
     """
-    industries = list(Industry.objects.all())
     industries_and_skills = {}
-    for industry in industries:
+    for industry in Industry.objects.all():
         # sum all significances for the same skill and then sort on total significance
         skills = list(
-            IndustryJobSkill.objects.filter(industry=industry).values_list('skill__name', flat=True).annotate(
-                total_significance=Sum('significance')).order_by('-total_significance').distinct()[
-                    :EMBEDDED_OBJECT_LENGTH_CAP]
+            IndustryJobSkill.objects.filter(
+                industry=industry
+            ).values_list(
+                'skill__name', flat=True
+            ).annotate(
+                total_significance=Sum('significance')
+            ).order_by(
+                '-total_significance'
+            ).distinct()[:EMBEDDED_OBJECT_LENGTH_CAP]
         )
         industries_and_skills[industry.name] = skills
     return industries_and_skills