Skip to content

Commit 51176cb

Browse files
author
Bernard Szabo
committed
feat: TNL-11812 no nested course optimizer functions
for unit testing ease
1 parent 067e1b0 commit 51176cb

File tree

1 file changed

+139
-137
lines changed

1 file changed

+139
-137
lines changed

cms/djangoapps/contentstore/tasks.py

+139-137
Original file line numberDiff line numberDiff line change
@@ -1104,161 +1104,163 @@ def generate_name(cls, arguments_dict):
11041104
key = arguments_dict['course_key_string']
11051105
return f'Broken link check of {key}'
11061106

1107+
# -------------- Course optimizer functions ------------------
11071108

1108-
@shared_task(base=CourseLinkCheckTask, bind=True)
1109-
def check_broken_links(self, user_id, course_key_string, language):
1109+
def _validate_user(task, user_id, language):
1110+
"""Validate if the user exists. Otherwise log error. """
1111+
try:
1112+
return User.objects.get(pk=user_id)
1113+
except User.DoesNotExist as exc:
1114+
with translation_language(language):
1115+
task.status.fail(UserErrors.UNKNOWN_USER_ID.format(user_id))
1116+
return
1117+
1118+
def _get_urls(content):
11101119
"""
1111-
Checks for broken links in a course. Store the results in a file.
1120+
Returns all urls found after href and src in content.
1121+
Excludes urls that are only '#'.
11121122
"""
1113-
def validate_user():
1114-
"""Validate if the user exists. Otherwise log error. """
1115-
try:
1116-
return User.objects.get(pk=user_id)
1117-
except User.DoesNotExist as exc:
1118-
with translation_language(language):
1119-
self.status.fail(UserErrors.UNKNOWN_USER_ID.format(user_id))
1120-
return
1123+
regex = r'\s+(?:href|src)=["\'](?!#)([^"\']*)["\']'
1124+
url_list = re.findall(regex, content)
1125+
return url_list
11211126

1122-
def get_urls(content):
1123-
"""
1124-
Returns all urls found after href and src in content.
1125-
Excludes urls that are only '#'.
1126-
"""
1127-
regex = r'\s+(?:href|src)=["\'](?!#)([^"\']*)["\']'
1128-
url_list = re.findall(regex, content)
1129-
return url_list
1130-
1131-
def is_studio_url(url):
1132-
"""Returns True if url is a studio url."""
1133-
return not url.startswith('http://') and not url.startswith('https://')
1134-
1135-
def convert_to_standard_url(url, course_key):
1136-
"""
1137-
Returns standard urls when given studio urls. Otherwise return url as is.
1138-
Example urls:
1139-
/assets/courseware/v1/506da5d6f866e8f0be44c5df8b6e6b2a/asset-v1:edX+DemoX+Demo_Course+type@asset+block/getting-started_x250.png
1140-
/static/getting-started_x250.png
1141-
/container/block-v1:edX+DemoX+Demo_Course+type@vertical+block@2152d4a4aadc4cb0af5256394a3d1fc7
1142-
"""
1143-
if is_studio_url(url):
1144-
if url.startswith('/static/'):
1145-
processed_url = replace_static_urls(f'\"{url}\"', course_id=course_key)[1:-1]
1146-
return 'http://' + settings.CMS_BASE + processed_url
1147-
elif url.startswith('/'):
1148-
return 'http://' + settings.CMS_BASE + url
1149-
else:
1150-
return 'http://' + settings.CMS_BASE + '/container/' + url
1127+
def _is_studio_url(url):
1128+
"""Returns True if url is a studio url."""
1129+
return not url.startswith('http://') and not url.startswith('https://')
1130+
1131+
def _convert_to_standard_url(url, course_key):
1132+
"""
1133+
Returns standard urls when given studio urls. Otherwise return url as is.
1134+
Example urls:
1135+
/assets/courseware/v1/506da5d6f866e8f0be44c5df8b6e6b2a/asset-v1:edX+DemoX+Demo_Course+type@asset+block/getting-started_x250.png
1136+
/static/getting-started_x250.png
1137+
/container/block-v1:edX+DemoX+Demo_Course+type@vertical+block@2152d4a4aadc4cb0af5256394a3d1fc7
1138+
"""
1139+
if _is_studio_url(url):
1140+
if url.startswith('/static/'):
1141+
processed_url = replace_static_urls(f'\"{url}\"', course_id=course_key)[1:-1]
1142+
return 'http://' + settings.CMS_BASE + processed_url
1143+
elif url.startswith('/'):
1144+
return 'http://' + settings.CMS_BASE + url
11511145
else:
1152-
return url
1146+
return 'http://' + settings.CMS_BASE + '/container/' + url
1147+
else:
1148+
return url
11531149

1154-
def scan_course_for_links(course_key):
1155-
"""
1156-
Returns a list of all urls in a course.
1157-
Returns: [ [block_id1, url1], [block_id2, url2], ... ]
1158-
"""
1159-
verticals = modulestore().get_items(course_key, qualifiers={'category': 'vertical'}, revision=ModuleStoreEnum.RevisionOption.published_only)
1160-
blocks = []
1161-
urls_to_validate = []
1150+
def _scan_course_for_links(course_key):
1151+
"""
1152+
Returns a list of all urls in a course.
1153+
Returns: [ [block_id1, url1], [block_id2, url2], ... ]
1154+
"""
1155+
verticals = modulestore().get_items(course_key, qualifiers={'category': 'vertical'},
1156+
revision=ModuleStoreEnum.RevisionOption.published_only)
1157+
blocks = []
1158+
urls_to_validate = []
11621159

1163-
for vertical in verticals:
1164-
blocks.extend(vertical.get_children())
1160+
for vertical in verticals:
1161+
blocks.extend(vertical.get_children())
11651162

1166-
for block in blocks:
1167-
block_id = str(block.usage_key)
1168-
block_info = get_block_info(block)
1169-
block_data = block_info['data']
1163+
for block in blocks:
1164+
block_id = str(block.usage_key)
1165+
block_info = get_block_info(block)
1166+
block_data = block_info['data']
11701167

1171-
url_list = get_urls(block_data)
1172-
urls_to_validate += [[block_id, url] for url in url_list]
1168+
url_list = _get_urls(block_data)
1169+
urls_to_validate += [[block_id, url] for url in url_list]
11731170

1174-
return urls_to_validate
1171+
return urls_to_validate
11751172

1176-
async def validate_url_access(session, url_data, course_key):
1177-
"""
1178-
Returns the status of a url request
1179-
Returns: {block_id1, url1, status}
1180-
"""
1181-
block_id, url = url_data
1182-
result = {'block_id': block_id, 'url': url}
1183-
standardized_url = convert_to_standard_url(url, course_key)
1184-
try:
1185-
async with session.get(standardized_url, timeout=5) as response:
1186-
result.update({'status': response.status})
1187-
except Exception as e:
1188-
result.update({'status': None})
1189-
LOGGER.debug(f'[Link Check] Request error when validating {url}: {str(e)}')
1190-
return result
1191-
1192-
async def validate_urls_access_in_batches(url_list, course_key, batch_size=100):
1193-
"""
1194-
Returns the statuses of a list of url requests.
1195-
Returns: [ {block_id1, url1, status}, {block_id2, url2, status}, ... ]
1196-
"""
1197-
responses = []
1198-
url_count = len(url_list)
1199-
1200-
for i in range(0, url_count, batch_size):
1201-
batch = url_list[i:i + batch_size]
1202-
async with aiohttp.ClientSession() as session:
1203-
tasks = [validate_url_access(session, url_data, course_key) for url_data in batch]
1204-
batch_results = await asyncio.gather(*tasks)
1205-
responses.extend(batch_results)
1206-
LOGGER.debug(f'[Link Check] request batch {i // batch_size+1} of {url_count // batch_size + 1}')
1207-
1208-
return responses
1209-
1210-
def retry_validation(url_list, course_key, retry_count=3):
1211-
"""Retry urls that failed due to connection error."""
1212-
results = []
1213-
retry_list = url_list
1214-
for i in range(0, retry_count):
1215-
if retry_list:
1216-
LOGGER.debug(f'[Link Check] retry attempt #{i+1}')
1217-
validated_url_list = asyncio.run(
1218-
validate_urls_access_in_batches(retry_list, course_key, batch_size=100)
1219-
)
1220-
filetered_url_list, retry_list = filter_by_status(validated_url_list)
1221-
results.extend(filetered_url_list)
1173+
async def _validate_url_access(session, url_data, course_key):
1174+
"""
1175+
Returns the status of a url request
1176+
Returns: {block_id1, url1, status}
1177+
"""
1178+
block_id, url = url_data
1179+
result = {'block_id': block_id, 'url': url}
1180+
standardized_url = _convert_to_standard_url(url, course_key)
1181+
try:
1182+
async with session.get(standardized_url, timeout=5) as response:
1183+
result.update({'status': response.status})
1184+
except Exception as e:
1185+
result.update({'status': None})
1186+
LOGGER.debug(f'[Link Check] Request error when validating {url}: {str(e)}')
1187+
return result
1188+
1189+
async def _validate_urls_access_in_batches(url_list, course_key, batch_size=100):
1190+
"""
1191+
Returns the statuses of a list of url requests.
1192+
Returns: [ {block_id1, url1, status}, {block_id2, url2, status}, ... ]
1193+
"""
1194+
responses = []
1195+
url_count = len(url_list)
1196+
1197+
for i in range(0, url_count, batch_size):
1198+
batch = url_list[i:i + batch_size]
1199+
async with aiohttp.ClientSession() as session:
1200+
tasks = [_validate_url_access(session, url_data, course_key) for url_data in batch]
1201+
batch_results = await asyncio.gather(*tasks)
1202+
responses.extend(batch_results)
1203+
LOGGER.debug(f'[Link Check] request batch {i // batch_size + 1} of {url_count // batch_size + 1}')
1204+
1205+
return responses
1206+
1207+
def _retry_validation(url_list, course_key, retry_count=3):
1208+
"""Retry urls that failed due to connection error."""
1209+
results = []
1210+
retry_list = url_list
1211+
for i in range(0, retry_count):
1212+
if retry_list:
1213+
LOGGER.debug(f'[Link Check] retry attempt #{i + 1}')
1214+
validated_url_list = asyncio.run(
1215+
_validate_urls_access_in_batches(retry_list, course_key, batch_size=100)
1216+
)
1217+
filetered_url_list, retry_list = _filter_by_status(validated_url_list)
1218+
results.extend(filetered_url_list)
12221219

1223-
results.extend(retry_list)
1220+
results.extend(retry_list)
12241221

1225-
return results
1222+
return results
12261223

1227-
def filter_by_status(results):
1228-
"""
1229-
Filter results by status.
1230-
200: OK. No need to do more
1231-
403: Forbidden. Record as locked link.
1232-
None: Error. Retry up to 3 times.
1233-
Other: Failure. Record as broken link.
1234-
Returns:
1235-
filtered_results: [ [block_id1, url1, is_locked], ... ]
1236-
retry_list: [ [block_id1, url1], ... ]
1237-
"""
1238-
filtered_results = []
1239-
retry_list = []
1240-
for result in results:
1241-
if result['status'] is None:
1242-
retry_list.append([result['block_id'], result['url']])
1243-
elif result['status'] == 200:
1244-
continue
1245-
elif result['status'] == 403 and is_studio_url(result['url']):
1246-
filtered_results.append([result['block_id'], result['url'], True])
1247-
else:
1248-
filtered_results.append([result['block_id'], result['url'], False])
1249-
1250-
return filtered_results, retry_list
1224+
def _filter_by_status(results):
1225+
"""
1226+
Filter results by status.
1227+
200: OK. No need to do more
1228+
403: Forbidden. Record as locked link.
1229+
None: Error. Retry up to 3 times.
1230+
Other: Failure. Record as broken link.
1231+
Returns:
1232+
filtered_results: [ [block_id1, url1, is_locked], ... ]
1233+
retry_list: [ [block_id1, url1], ... ]
1234+
"""
1235+
filtered_results = []
1236+
retry_list = []
1237+
for result in results:
1238+
if result['status'] is None:
1239+
retry_list.append([result['block_id'], result['url']])
1240+
elif result['status'] == 200:
1241+
continue
1242+
elif result['status'] == 403 and _is_studio_url(result['url']):
1243+
filtered_results.append([result['block_id'], result['url'], True])
1244+
else:
1245+
filtered_results.append([result['block_id'], result['url'], False])
12511246

1252-
user = validate_user()
1247+
return filtered_results, retry_list
1248+
1249+
@shared_task(base=CourseLinkCheckTask, bind=True)
1250+
def check_broken_links(self, user_id, course_key_string, language):
1251+
"""
1252+
Checks for broken links in a course. Store the results in a file.
1253+
"""
1254+
user = _validate_user(self, user_id, language)
12531255

12541256
self.status.set_state('Scanning')
12551257
course_key = CourseKey.from_string(course_key_string)
1256-
url_list = scan_course_for_links(course_key)
1257-
validated_url_list = asyncio.run(validate_urls_access_in_batches(url_list, course_key, batch_size=100))
1258-
broken_or_locked_urls, retry_list = filter_by_status(validated_url_list)
1259-
1258+
url_list = _scan_course_for_links(course_key)
1259+
validated_url_list = asyncio.run(_validate_urls_access_in_batches(url_list, course_key, batch_size=100))
1260+
broken_or_locked_urls, retry_list = _filter_by_status(validated_url_list)
1261+
12601262
if retry_list:
1261-
retry_results = retry_validation(retry_list, course_key, retry_count=3)
1263+
retry_results = _retry_validation(retry_list, course_key, retry_count=3)
12621264
broken_or_locked_urls.extend(retry_results)
12631265

12641266
try:
@@ -1274,7 +1276,7 @@ def filter_by_status(results):
12741276
artifact = UserTaskArtifact(status=self.status, name='BrokenLinks')
12751277
artifact.file.save(name=os.path.basename(broken_links_file.name), content=File(broken_links_file))
12761278
artifact.save()
1277-
1279+
12781280
# catch all exceptions so we can record useful error messages
12791281
except Exception as e: # pylint: disable=broad-except
12801282
LOGGER.exception('Error checking links for course %s', course_key, exc_info=True)

0 commit comments

Comments
 (0)