Skip to content

Commit

Permalink
Merge pull request #172 from openedx/saleem-latif/ENT-7294-updates
Browse files Browse the repository at this point in the history
[ENT-7294] Performance enhancements in job recommendations calculation.
  • Loading branch information
saleem-latif authored Jul 31, 2023
2 parents 8a6788f + e707dc7 commit 25e881e
Show file tree
Hide file tree
Showing 4 changed files with 117 additions and 44 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ Change Log
Unreleased

[1.43.1] - 2023-07-31
---------------------
* perf: Performance enhancements in job recommendations calculation.

[1.43.0] - 2023-07-07
---------------------
* feat: reuse tags from similar product for xblock skills.
Expand Down
2 changes: 1 addition & 1 deletion taxonomy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,6 @@
# 2. MINOR version when you add functionality in a backwards compatible manner, and
# 3. PATCH version when you make backwards compatible bug fixes.
# More details can be found at https://semver.org/
__version__ = '1.42.3'
__version__ = '1.43.1'

default_app_config = 'taxonomy.apps.TaxonomyConfig' # pylint: disable=invalid-name
92 changes: 50 additions & 42 deletions taxonomy/algolia/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"""
import logging
import datetime
import pandas as pd
from collections import deque, namedtuple

from django.conf import settings
from django.db.models import Sum
Expand All @@ -16,6 +16,8 @@

LOGGER = logging.getLogger(__name__)

JobRecommendation = namedtuple('JobRecommendation', 'name similarity')


def index_jobs_data_in_algolia():
"""
Expand Down Expand Up @@ -78,6 +80,36 @@ def combine_jobs_and_skills_data(jobs_qs):
return all_job_and_skills_data


def insert_item_in_ordered_queue(queue, item, key=lambda arg: arg):
    """
    Insert an item into a queue that is kept sorted in descending order.

    `queue` is assumed to already be ordered, largest first, by the value `key` returns.
    `item` is placed in front of the first element whose key is strictly smaller, so an
    item that ties with existing elements ends up after them.

    If the queue is already at `maxlen`, the last (smallest) element is dropped to make
    room. If the queue is full and no element is smaller than `item`, the queue is left
    unchanged.

    Arguments:
        queue (deque<Any>): Queue of items, sorted in descending order of `key`. Mutated in place.
        item (Any): Item that needs to be inserted.
        key (func): Optional function returning the comparable attribute of an item.
    """
    # The key of the new item does not change while scanning, so compute it only once.
    item_key = key(item)

    # For an unbounded deque `maxlen` is None, so this is always False.
    is_full = len(queue) == queue.maxlen

    for index, element in enumerate(queue):
        if item_key > key(element):
            if is_full:
                # `deque.insert` raises IndexError on a full bounded deque,
                # so evict the smallest element first.
                queue.pop()
            queue.insert(index, item)

            # Return immediately: the deque was mutated, so iteration must not continue.
            return

    # No smaller element was found; the item belongs at the end, if there is room for it.
    if not is_full:
        queue.append(item)


def calculate_job_recommendations(jobs):
"""
Calculate job recommendations.
Expand All @@ -88,57 +120,33 @@ def calculate_job_recommendations(jobs):
Returns:
(list<dict>): A list of dicts containing jobs and their recommended jobs.
"""
candidate_jobs = []
matching_jobs = []
jaccard_similarities = []
SIMILAR_JOBS_COUNT = 3
job_recommendations = deque([], maxlen=SIMILAR_JOBS_COUNT)
jobs_and_recommendations = []

for job in jobs:
job_skills_set = set(job['skills'])
# converting skills list into set, to avoid repeated converting in the nested loop.
jobs = [
{'name': job['name'], 'skills': set(job['skills'])} for job in jobs
]

for job in jobs:
for candidate_job in jobs:
if job['name'] == candidate_job['name']:
continue

other_job_skills_set = set(candidate_job['skills'])
jaccard_similarity = calculate_jaccard_similarity(job_skills_set, other_job_skills_set)
candidate_jobs.append(job['name'])
matching_jobs.append(candidate_job['name'])
jaccard_similarities.append(jaccard_similarity)

dtype_dict = {
'job': "category",
'matching_job': "category",
'jaccard_similarity': "float16",
}

LOGGER.info('[TAXONOMY] [DEBUG] Calculating similar jobs for the active jobs.')
similar_jobs = pd.DataFrame(
{
'job': candidate_jobs,
'matching_job': matching_jobs,
'jaccard_similarity': jaccard_similarities,
},
).astype(dtype_dict)

LOGGER.info('[TAXONOMY] [DEBUG] similar jobs data frame created.')
similar_jobs['rank'] = similar_jobs.groupby('job')['jaccard_similarity'].rank(method='first', ascending=False)

# This line is necessary as it makes sure save the copy of the data frame generated by
# `similar_jobs[similar_jobs['rank'] <= 3]` in a new variable.
similar_jobs = similar_jobs[similar_jobs['rank'] <= 3]
similar_jobs.sort_values(
by=['job', 'rank'],
ascending=[True, True],
inplace=True,
)
jaccard_similarity = calculate_jaccard_similarity(job['skills'], candidate_job['skills'])

insert_item_in_ordered_queue(
queue=job_recommendations,
item=JobRecommendation(job['name'], jaccard_similarity),
key=lambda item: item.similarity,
)

LOGGER.info('[TAXONOMY] [DEBUG] similar jobs calculation complete.')
jobs_and_recommendations = []
for job in jobs:
jobs_and_recommendations.append({
'name': job['name'],
'similar_jobs': similar_jobs[similar_jobs['job'] == job['name']]['matching_job'].tolist(),
'similar_jobs': [item.name for item in job_recommendations],
})

return jobs_and_recommendations


Expand Down
63 changes: 62 additions & 1 deletion tests/algolia/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,19 @@
"""
Tests for algolia utility functions.
"""
from collections import deque

import mock
from pytest import mark

from taxonomy.algolia.constants import ALGOLIA_JOBS_INDEX_SETTINGS
from taxonomy.algolia.utils import calculate_jaccard_similarity, fetch_jobs_data, index_jobs_data_in_algolia
from taxonomy.algolia.utils import (
JobRecommendation,
calculate_jaccard_similarity,
fetch_jobs_data,
index_jobs_data_in_algolia,
insert_item_in_ordered_queue,
)
from test_utils import factories
from test_utils.testcase import TaxonomyTestCase

Expand Down Expand Up @@ -211,3 +219,56 @@ def test_index_jobs_data_in_algolia(self, algolia_search_client_mock):

set_settings_mock.assert_called_once_with(ALGOLIA_JOBS_INDEX_SETTINGS)
replace_all_objects_mock.assert_called_once_with(mock.ANY, mock.ANY)

def test_insert_item_in_ordered_queue(self):
    """
    Verify that insert_item_in_ordered_queue keeps a bounded queue sorted in descending order.
    """
    queue = deque([], maxlen=3)

    def by_similarity(job):
        return job.similarity

    def push(name, similarity):
        insert_item_in_ordered_queue(queue, JobRecommendation(name, similarity), key=by_similarity)

    # Fill the queue with three jobs, out of order, and check that they end up sorted.
    push('job-2', 2)
    push('job-1', 1)
    push('job-3', 3)

    first, second, third = queue[0], queue[1], queue[2]
    assert first.name == 'job-3'
    assert second.name == 'job-2'
    assert third.name == 'job-1'

    # An item larger than everything else goes to the front and evicts the smallest one.
    push('job-4', 4)
    assert len(queue) == 3
    first, second, third = queue[0], queue[1], queue[2]
    assert first.name == 'job-4'
    assert second.name == 'job-3'
    assert third.name == 'job-2'

    # An item that belongs in the middle is placed in the middle.
    push('job-3.5', 3.5)
    assert len(queue) == 3
    first, second, third = queue[0], queue[1], queue[2]
    assert first.name == 'job-4'
    assert second.name == 'job-3.5'
    assert third.name == 'job-3'

    # An item that belongs at the end replaces the last element.
    push('job-3.1', 3.1)
    assert len(queue) == 3
    first, second, third = queue[0], queue[1], queue[2]
    assert first.name == 'job-4'
    assert second.name == 'job-3.5'
    assert third.name == 'job-3.1'

    # An item smaller than everything in a full queue must not be inserted.
    push('job-2', 2)
    assert len(queue) == 3
    first, second, third = queue[0], queue[1], queue[2]
    assert first.name == 'job-4'
    assert second.name == 'job-3.5'
    assert third.name == 'job-3.1'

    # The default key compares the items themselves.
    plain_queue = deque([], maxlen=3)
    for number in (1, 3, 2):
        insert_item_in_ordered_queue(plain_queue, number)

    assert plain_queue[0] == 3
    assert plain_queue[1] == 2
    assert plain_queue[2] == 1

0 comments on commit 25e881e

Please sign in to comment.