Skip to content

Commit

Permalink
Merge pull request #170 from openedx/saleem-latif/ENT-7294
Browse files Browse the repository at this point in the history
ENT-7294: memory optimization for job recommendations.
  • Loading branch information
saleem-latif authored Jul 25, 2023
2 parents 347747d + e551c50 commit db90fea
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 33 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ Change Log
Unreleased

[1.42.3] - 2023-07-14
---------------------
* perf: memory optimisation for job recommendations.

[1.42.2] - 2023-07-14
---------------------
* perf: pandas dataframe loading memory optimisation
Expand Down
2 changes: 1 addition & 1 deletion taxonomy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,6 @@
# 2. MINOR version when you add functionality in a backwards compatible manner, and
# 3. PATCH version when you make backwards compatible bug fixes.
# More details can be found at https://semver.org/
__version__ = '1.42.2'
__version__ = '1.42.3'

default_app_config = 'taxonomy.apps.TaxonomyConfig' # pylint: disable=invalid-name
64 changes: 32 additions & 32 deletions taxonomy/algolia/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,41 +50,26 @@ def calculate_jaccard_similarity(set_a, set_b):
Calculate Jaccard similarity between two sets of job skills.
"""
try:
intersection = set_a.intersection(set_b)
jaccard_similarity = len(intersection) / len(set_a.union(set_b))
return jaccard_similarity
return len(set_a.intersection(set_b)) / len(set_a.union(set_b))
except ZeroDivisionError:
return float(0)


def fetch_job_skills(job, all_job_skills):
"""
Construct a list of all the job skills from the database.
Returns:
(list<dict>): A list of dicts containing job skills data.
"""
job_skills = all_job_skills.filter(job=job)
skills = []
for job_skill in job_skills:
skills.append(job_skill.skill.name)
return skills


def combine_jobs_and_skills_data(jobs):
def combine_jobs_and_skills_data(jobs_qs):
"""
Combine jobs and skills data.
Arguments:
jobs_qs (QuerySet): Django queryset of Job model that will be used as a starting point to fetch skills data.
Returns:
(list<dict>): A list of dicts containing job and their skills in a list.
"""
jobs = jobs.all()
all_job_skills = JobSkills.objects.all()

all_job_and_skills_data = []
for job in jobs:
all_job_skills = JobSkills.objects.filter(job=job)
skills = fetch_job_skills(job, all_job_skills)
for job in jobs_qs.all():
skills = list(
JobSkills.objects.filter(job=job).values_list('skill__name', flat=True)
)
all_job_and_skills_data.append({
'name': job.name,
'skills': skills,
Expand All @@ -98,7 +83,7 @@ def calculate_job_recommendations(jobs):
Calculate job recommendations.
Args:
job (list<dict>): AA list of dicts containing job and their skills in a list.
jobs (list<dict>): A list of dicts containing job and their skills in a list.
Returns:
(list<dict>): A list of dicts containing jobs and their recommended jobs.
Expand Down Expand Up @@ -126,6 +111,7 @@ def calculate_job_recommendations(jobs):
'jaccard_similarity': "float16",
}

LOGGER.info('[TAXONOMY] [DEBUG] Calculating similar jobs for the active jobs.')
similar_jobs = pd.DataFrame(
{
'job': candidate_jobs,
Expand All @@ -134,10 +120,19 @@ def calculate_job_recommendations(jobs):
},
).astype(dtype_dict)

LOGGER.info('[TAXONOMY] [DEBUG] similar jobs data frame created.')
similar_jobs['rank'] = similar_jobs.groupby('job')['jaccard_similarity'].rank(method='first', ascending=False)
mask = (similar_jobs['rank'] <= 3)
similar_jobs = similar_jobs[mask].sort_values(by=['job', 'rank'], ascending=[True, True])

# This line is necessary as it makes sure to save the copy of the data frame generated by
# `similar_jobs[similar_jobs['rank'] <= 3]` in a new variable.
similar_jobs = similar_jobs[similar_jobs['rank'] <= 3]
similar_jobs.sort_values(
by=['job', 'rank'],
ascending=[True, True],
inplace=True,
)

LOGGER.info('[TAXONOMY] [DEBUG] similar jobs calculation complete.')
jobs_and_recommendations = []
for job in jobs:
jobs_and_recommendations.append({
Expand All @@ -151,14 +146,19 @@ def combine_industry_skills():
"""
Constructs a dict with keys as industry names and values as their skills.
"""
industries = list(Industry.objects.all())
industries_and_skills = {}
for industry in industries:
for industry in Industry.objects.all():
# sum all significances for the same skill and then sort on total significance
skills = list(
IndustryJobSkill.objects.filter(industry=industry).values_list('skill__name', flat=True).annotate(
total_significance=Sum('significance')).order_by('-total_significance').distinct()[
:EMBEDDED_OBJECT_LENGTH_CAP]
IndustryJobSkill.objects.filter(
industry=industry
).values_list(
'skill__name', flat=True
).annotate(
total_significance=Sum('significance')
).order_by(
'-total_significance'
).distinct()[:EMBEDDED_OBJECT_LENGTH_CAP]
)
industries_and_skills[industry.name] = skills
return industries_and_skills
Expand Down

0 comments on commit db90fea

Please sign in to comment.