fix: truncate oversized data in EMSI client for xblock-skills job (#182)

mahamakifdar19 · web-flow · commit 50d77ac87e56 · 2023-09-13T15:09:38.000+05:00
* fix: truncate data to 50,000 bytes in EMSI client for xblock-skills job

* fix: Added unit test
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -13,6 +13,10 @@ Change Log
 
 Unreleased
 
+[1.44.2] - 2023-09-11
+---------------------
+* fix: truncate data to 50,000 bytes in EMSI client for xblock-skills job
+
 [1.44.1] - 2023-08-25
 ---------------------
 * feat: add prefetch related to the whitelisted product skills
diff --git a/taxonomy/__init__.py b/taxonomy/__init__.py
@@ -15,6 +15,6 @@
 # 2. MINOR version when you add functionality in a backwards compatible manner, and
 # 3. PATCH version when you make backwards compatible bug fixes.
 # More details can be found at https://semver.org/
-__version__ = '1.44.1'
+__version__ = '1.44.2'
 
 default_app_config = 'taxonomy.apps.TaxonomyConfig'  # pylint: disable=invalid-name
diff --git a/taxonomy/emsi/client.py b/taxonomy/emsi/client.py
@@ -183,6 +183,7 @@ class EMSISkillsApiClient(JwtEMSIApiClient):
     """
 
     API_BASE_URL = urljoin(JwtEMSIApiClient.API_BASE_URL, '/skills/versions/8.9')
+    MAX_LIGHTCAST_DATA_SIZE = 50000  # Maximum 50,000-byte data is supported by LightCast
 
     def __init__(self):
         """
@@ -229,6 +230,11 @@ def get_product_skills(self, text_data):
         Returns:
             dict: A dictionary containing details of all the skills.
         """
+
+        if text_data and len(text_data) > self.MAX_LIGHTCAST_DATA_SIZE:
+            # Truncate to LightCast's limit. NOTE(review): on a str this counts characters, not encoded bytes.
+            text_data = text_data[:self.MAX_LIGHTCAST_DATA_SIZE]
+
         data = {
             'text': text_data
         }
diff --git a/tests/emsi/test_client.py b/tests/emsi/test_client.py
@@ -5,6 +5,8 @@
 
 import logging
 from time import time
+from unittest import mock
+from faker import Faker
 
 import responses
 from pytest import raises
@@ -161,6 +163,21 @@ def test_get_product_skills(self):
 
         assert skills == SKILLS_EMSI_CLIENT_RESPONSE
 
+    def test_get_product_skills_large_text(self):
+        """
+        Validate that text longer than MAX_LIGHTCAST_DATA_SIZE is truncated before it is posted.
+        """
+        api_response = mock.Mock()
+        api_response.json.return_value = SKILLS_EMSI_RESPONSE
+        self.client.is_token_expired = mock.Mock(return_value=False)
+        self.client.client = mock.MagicMock(post=mock.Mock(return_value=api_response))
+
+        max_data_size = self.client.MAX_LIGHTCAST_DATA_SIZE
+        skill_text_data = Faker().text(max_data_size + max_data_size * 0.1)  # ask for ~10% over the limit
+        self.client.get_product_skills(skill_text_data)
+
+        assert len(self.client.client.post.call_args_list[0][1]['json']['text']) == max_data_size
+
     @mock_api_response(
         method=responses.POST,
         url=EMSISkillsApiClient.API_BASE_URL + '/extract',