Skip to content

Commit db90fea

Browse files
authored
Merge pull request #170 from openedx/saleem-latif/ENT-7294
ENT-7294: memory optimization for job recommendations.
2 parents 347747d + e551c50 commit db90fea

File tree

3 files changed

+37
-33
lines changed

3 files changed

+37
-33
lines changed

CHANGELOG.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@ Change Log
1313
1414
Unreleased
1515

16+
[1.42.3] - 2023-07-14
17+
---------------------
18+
* perf: memory optimisation for job recommendations.
19+
1620
[1.42.2] - 2023-07-14
1721
---------------------
1822
* perf: pandas dataframe loading memory optimisation

taxonomy/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,6 @@
1515
# 2. MINOR version when you add functionality in a backwards compatible manner, and
1616
# 3. PATCH version when you make backwards compatible bug fixes.
1717
# More details can be found at https://semver.org/
18-
__version__ = '1.42.2'
18+
__version__ = '1.42.3'
1919

2020
default_app_config = 'taxonomy.apps.TaxonomyConfig' # pylint: disable=invalid-name

taxonomy/algolia/utils.py

Lines changed: 32 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -50,41 +50,26 @@ def calculate_jaccard_similarity(set_a, set_b):
5050
Calculate Jaccard similarity between two sets of job skills.
5151
"""
5252
try:
53-
intersection = set_a.intersection(set_b)
54-
jaccard_similarity = len(intersection) / len(set_a.union(set_b))
55-
return jaccard_similarity
53+
return len(set_a.intersection(set_b)) / len(set_a.union(set_b))
5654
except ZeroDivisionError:
5755
return float(0)
5856

5957

60-
def fetch_job_skills(job, all_job_skills):
61-
"""
62-
Construct a list of all the job skills from the database.
63-
64-
Returns:
65-
(list<dict>): A list of dicts containing job skills data.
66-
"""
67-
job_skills = all_job_skills.filter(job=job)
68-
skills = []
69-
for job_skill in job_skills:
70-
skills.append(job_skill.skill.name)
71-
return skills
72-
73-
74-
def combine_jobs_and_skills_data(jobs):
58+
def combine_jobs_and_skills_data(jobs_qs):
7559
"""
7660
Combine jobs and skills data.
7761
62+
Arguments:
63+
jobs_qs (QuerySet): Django queryset of Job model that will be used as a starting point to fetch skills data.
64+
7865
Returns:
7966
(list<dict>): A list of dicts containing job and their skills in a list.
8067
"""
81-
jobs = jobs.all()
82-
all_job_skills = JobSkills.objects.all()
83-
8468
all_job_and_skills_data = []
85-
for job in jobs:
86-
all_job_skills = JobSkills.objects.filter(job=job)
87-
skills = fetch_job_skills(job, all_job_skills)
69+
for job in jobs_qs.all():
70+
skills = list(
71+
JobSkills.objects.filter(job=job).values_list('skill__name', flat=True)
72+
)
8873
all_job_and_skills_data.append({
8974
'name': job.name,
9075
'skills': skills,
@@ -98,7 +83,7 @@ def calculate_job_recommendations(jobs):
9883
Calculate job recommendations.
9984
10085
Args:
101-
job (list<dict>): AA list of dicts containing job and their skills in a list.
86+
jobs (list<dict>): A list of dicts containing job and their skills in a list.
10287
10388
Returns:
10489
(list<dict>): A list of dicts containing jobs and their recommended jobs.
@@ -126,6 +111,7 @@ def calculate_job_recommendations(jobs):
126111
'jaccard_similarity': "float16",
127112
}
128113

114+
LOGGER.info('[TAXONOMY] [DEBUG] Calculating similar jobs for the active jobs.')
129115
similar_jobs = pd.DataFrame(
130116
{
131117
'job': candidate_jobs,
@@ -134,10 +120,19 @@ def calculate_job_recommendations(jobs):
134120
},
135121
).astype(dtype_dict)
136122

123+
LOGGER.info('[TAXONOMY] [DEBUG] similar jobs data frame created.')
137124
similar_jobs['rank'] = similar_jobs.groupby('job')['jaccard_similarity'].rank(method='first', ascending=False)
138-
mask = (similar_jobs['rank'] <= 3)
139-
similar_jobs = similar_jobs[mask].sort_values(by=['job', 'rank'], ascending=[True, True])
140125

126+
# This line is necessary as it makes sure to save the copy of the data frame generated by
127+
# `similar_jobs[similar_jobs['rank'] <= 3]` in a new variable.
128+
similar_jobs = similar_jobs[similar_jobs['rank'] <= 3]
129+
similar_jobs.sort_values(
130+
by=['job', 'rank'],
131+
ascending=[True, True],
132+
inplace=True,
133+
)
134+
135+
LOGGER.info('[TAXONOMY] [DEBUG] similar jobs calculation complete.')
141136
jobs_and_recommendations = []
142137
for job in jobs:
143138
jobs_and_recommendations.append({
@@ -151,14 +146,19 @@ def combine_industry_skills():
151146
"""
152147
Constructs a dict with keys as industry names and values as their skills.
153148
"""
154-
industries = list(Industry.objects.all())
155149
industries_and_skills = {}
156-
for industry in industries:
150+
for industry in Industry.objects.all():
157151
# sum all significances for the same skill and then sort on total significance
158152
skills = list(
159-
IndustryJobSkill.objects.filter(industry=industry).values_list('skill__name', flat=True).annotate(
160-
total_significance=Sum('significance')).order_by('-total_significance').distinct()[
161-
:EMBEDDED_OBJECT_LENGTH_CAP]
153+
IndustryJobSkill.objects.filter(
154+
industry=industry
155+
).values_list(
156+
'skill__name', flat=True
157+
).annotate(
158+
total_significance=Sum('significance')
159+
).order_by(
160+
'-total_significance'
161+
).distinct()[:EMBEDDED_OBJECT_LENGTH_CAP]
162162
)
163163
industries_and_skills[industry.name] = skills
164164
return industries_and_skills

0 commit comments

Comments
 (0)