@@ -1104,161 +1104,163 @@ def generate_name(cls, arguments_dict):
1104
1104
key = arguments_dict ['course_key_string' ]
1105
1105
return f'Broken link check of { key } '
1106
1106
1107
+ # -------------- Course optimizer functions ------------------
1107
1108
1108
- @shared_task (base = CourseLinkCheckTask , bind = True )
1109
- def check_broken_links (self , user_id , course_key_string , language ):
1109
+ def _validate_user (task , user_id , language ):
1110
+ """Validate if the user exists. Otherwise log error. """
1111
+ try :
1112
+ return User .objects .get (pk = user_id )
1113
+ except User .DoesNotExist as exc :
1114
+ with translation_language (language ):
1115
+ task .status .fail (UserErrors .UNKNOWN_USER_ID .format (user_id ))
1116
+ return
1117
+
1118
+ def _get_urls (content ):
1110
1119
"""
1111
- Checks for broken links in a course. Store the results in a file.
1120
+ Returns all urls found after href and src in content.
1121
+ Excludes urls that are only '#'.
1112
1122
"""
1113
- def validate_user ():
1114
- """Validate if the user exists. Otherwise log error. """
1115
- try :
1116
- return User .objects .get (pk = user_id )
1117
- except User .DoesNotExist as exc :
1118
- with translation_language (language ):
1119
- self .status .fail (UserErrors .UNKNOWN_USER_ID .format (user_id ))
1120
- return
1123
+ regex = r'\s+(?:href|src)=["\'](?!#)([^"\']*)["\']'
1124
+ url_list = re .findall (regex , content )
1125
+ return url_list
1121
1126
1122
- def get_urls (content ):
1123
- """
1124
- Returns all urls found after href and src in content.
1125
- Excludes urls that are only '#'.
1126
- """
1127
- regex = r'\s+(?:href|src)=["\'](?!#)([^"\']*)["\']'
1128
- url_list = re .findall (regex , content )
1129
- return url_list
1130
-
1131
- def is_studio_url (url ):
1132
- """Returns True if url is a studio url."""
1133
- return not url .startswith ('http://' ) and not url .startswith ('https://' )
1134
-
1135
- def convert_to_standard_url (url , course_key ):
1136
- """
1137
- Returns standard urls when given studio urls. Otherwise return url as is.
1138
- Example urls:
1139
- /assets/courseware/v1/506da5d6f866e8f0be44c5df8b6e6b2a/asset-v1:edX+DemoX+Demo_Course+type@asset+block/getting-started_x250.png
1140
- /static/getting-started_x250.png
1141
- /container/block-v1:edX+DemoX+Demo_Course+type@vertical+block@2152d4a4aadc4cb0af5256394a3d1fc7
1142
- """
1143
- if is_studio_url (url ):
1144
- if url .startswith ('/static/' ):
1145
- processed_url = replace_static_urls (f'\" { url } \" ' , course_id = course_key )[1 :- 1 ]
1146
- return 'http://' + settings .CMS_BASE + processed_url
1147
- elif url .startswith ('/' ):
1148
- return 'http://' + settings .CMS_BASE + url
1149
- else :
1150
- return 'http://' + settings .CMS_BASE + '/container/' + url
1127
+ def _is_studio_url (url ):
1128
+ """Returns True if url is a studio url."""
1129
+ return not url .startswith ('http://' ) and not url .startswith ('https://' )
1130
+
1131
+ def _convert_to_standard_url (url , course_key ):
1132
+ """
1133
+ Returns standard urls when given studio urls. Otherwise return url as is.
1134
+ Example urls:
1135
+ /assets/courseware/v1/506da5d6f866e8f0be44c5df8b6e6b2a/asset-v1:edX+DemoX+Demo_Course+type@asset+block/getting-started_x250.png
1136
+ /static/getting-started_x250.png
1137
+ /container/block-v1:edX+DemoX+Demo_Course+type@vertical+block@2152d4a4aadc4cb0af5256394a3d1fc7
1138
+ """
1139
+ if _is_studio_url (url ):
1140
+ if url .startswith ('/static/' ):
1141
+ processed_url = replace_static_urls (f'\" { url } \" ' , course_id = course_key )[1 :- 1 ]
1142
+ return 'http://' + settings .CMS_BASE + processed_url
1143
+ elif url .startswith ('/' ):
1144
+ return 'http://' + settings .CMS_BASE + url
1151
1145
else :
1152
- return url
1146
+ return 'http://' + settings .CMS_BASE + '/container/' + url
1147
+ else :
1148
+ return url
1153
1149
1154
- def scan_course_for_links (course_key ):
1155
- """
1156
- Returns a list of all urls in a course.
1157
- Returns: [ [block_id1, url1], [block_id2, url2], ... ]
1158
- """
1159
- verticals = modulestore ().get_items (course_key , qualifiers = {'category' : 'vertical' }, revision = ModuleStoreEnum .RevisionOption .published_only )
1160
- blocks = []
1161
- urls_to_validate = []
1150
+ def _scan_course_for_links (course_key ):
1151
+ """
1152
+ Returns a list of all urls in a course.
1153
+ Returns: [ [block_id1, url1], [block_id2, url2], ... ]
1154
+ """
1155
+ verticals = modulestore ().get_items (course_key , qualifiers = {'category' : 'vertical' },
1156
+ revision = ModuleStoreEnum .RevisionOption .published_only )
1157
+ blocks = []
1158
+ urls_to_validate = []
1162
1159
1163
- for vertical in verticals :
1164
- blocks .extend (vertical .get_children ())
1160
+ for vertical in verticals :
1161
+ blocks .extend (vertical .get_children ())
1165
1162
1166
- for block in blocks :
1167
- block_id = str (block .usage_key )
1168
- block_info = get_block_info (block )
1169
- block_data = block_info ['data' ]
1163
+ for block in blocks :
1164
+ block_id = str (block .usage_key )
1165
+ block_info = get_block_info (block )
1166
+ block_data = block_info ['data' ]
1170
1167
1171
- url_list = get_urls (block_data )
1172
- urls_to_validate += [[block_id , url ] for url in url_list ]
1168
+ url_list = _get_urls (block_data )
1169
+ urls_to_validate += [[block_id , url ] for url in url_list ]
1173
1170
1174
- return urls_to_validate
1171
+ return urls_to_validate
1175
1172
1176
- async def validate_url_access (session , url_data , course_key ):
1177
- """
1178
- Returns the status of a url request
1179
- Returns: {block_id1, url1, status}
1180
- """
1181
- block_id , url = url_data
1182
- result = {'block_id' : block_id , 'url' : url }
1183
- standardized_url = convert_to_standard_url (url , course_key )
1184
- try :
1185
- async with session .get (standardized_url , timeout = 5 ) as response :
1186
- result .update ({'status' : response .status })
1187
- except Exception as e :
1188
- result .update ({'status' : None })
1189
- LOGGER .debug (f'[Link Check] Request error when validating { url } : { str (e )} ' )
1190
- return result
1191
-
1192
- async def validate_urls_access_in_batches (url_list , course_key , batch_size = 100 ):
1193
- """
1194
- Returns the statuses of a list of url requests.
1195
- Returns: [ {block_id1, url1, status}, {block_id2, url2, status}, ... ]
1196
- """
1197
- responses = []
1198
- url_count = len (url_list )
1199
-
1200
- for i in range (0 , url_count , batch_size ):
1201
- batch = url_list [i :i + batch_size ]
1202
- async with aiohttp .ClientSession () as session :
1203
- tasks = [validate_url_access (session , url_data , course_key ) for url_data in batch ]
1204
- batch_results = await asyncio .gather (* tasks )
1205
- responses .extend (batch_results )
1206
- LOGGER .debug (f'[Link Check] request batch { i // batch_size + 1 } of { url_count // batch_size + 1 } ' )
1207
-
1208
- return responses
1209
-
1210
- def retry_validation (url_list , course_key , retry_count = 3 ):
1211
- """Retry urls that failed due to connection error."""
1212
- results = []
1213
- retry_list = url_list
1214
- for i in range (0 , retry_count ):
1215
- if retry_list :
1216
- LOGGER .debug (f'[Link Check] retry attempt #{ i + 1 } ' )
1217
- validated_url_list = asyncio .run (
1218
- validate_urls_access_in_batches (retry_list , course_key , batch_size = 100 )
1219
- )
1220
- filetered_url_list , retry_list = filter_by_status (validated_url_list )
1221
- results .extend (filetered_url_list )
1173
+ async def _validate_url_access (session , url_data , course_key ):
1174
+ """
1175
+ Returns the status of a url request
1176
+ Returns: {block_id1, url1, status}
1177
+ """
1178
+ block_id , url = url_data
1179
+ result = {'block_id' : block_id , 'url' : url }
1180
+ standardized_url = _convert_to_standard_url (url , course_key )
1181
+ try :
1182
+ async with session .get (standardized_url , timeout = 5 ) as response :
1183
+ result .update ({'status' : response .status })
1184
+ except Exception as e :
1185
+ result .update ({'status' : None })
1186
+ LOGGER .debug (f'[Link Check] Request error when validating { url } : { str (e )} ' )
1187
+ return result
1188
+
1189
+ async def _validate_urls_access_in_batches (url_list , course_key , batch_size = 100 ):
1190
+ """
1191
+ Returns the statuses of a list of url requests.
1192
+ Returns: [ {block_id1, url1, status}, {block_id2, url2, status}, ... ]
1193
+ """
1194
+ responses = []
1195
+ url_count = len (url_list )
1196
+
1197
+ for i in range (0 , url_count , batch_size ):
1198
+ batch = url_list [i :i + batch_size ]
1199
+ async with aiohttp .ClientSession () as session :
1200
+ tasks = [_validate_url_access (session , url_data , course_key ) for url_data in batch ]
1201
+ batch_results = await asyncio .gather (* tasks )
1202
+ responses .extend (batch_results )
1203
+ LOGGER .debug (f'[Link Check] request batch { i // batch_size + 1 } of { url_count // batch_size + 1 } ' )
1204
+
1205
+ return responses
1206
+
1207
+ def _retry_validation (url_list , course_key , retry_count = 3 ):
1208
+ """Retry urls that failed due to connection error."""
1209
+ results = []
1210
+ retry_list = url_list
1211
+ for i in range (0 , retry_count ):
1212
+ if retry_list :
1213
+ LOGGER .debug (f'[Link Check] retry attempt #{ i + 1 } ' )
1214
+ validated_url_list = asyncio .run (
1215
+ _validate_urls_access_in_batches (retry_list , course_key , batch_size = 100 )
1216
+ )
1217
+ filetered_url_list , retry_list = _filter_by_status (validated_url_list )
1218
+ results .extend (filetered_url_list )
1222
1219
1223
- results .extend (retry_list )
1220
+ results .extend (retry_list )
1224
1221
1225
- return results
1222
+ return results
1226
1223
1227
- def filter_by_status (results ):
1228
- """
1229
- Filter results by status.
1230
- 200: OK. No need to do more
1231
- 403: Forbidden. Record as locked link.
1232
- None: Error. Retry up to 3 times.
1233
- Other: Failure. Record as broken link.
1234
- Returns:
1235
- filtered_results: [ [block_id1, url1, is_locked], ... ]
1236
- retry_list: [ [block_id1, url1], ... ]
1237
- """
1238
- filtered_results = []
1239
- retry_list = []
1240
- for result in results :
1241
- if result ['status' ] is None :
1242
- retry_list .append ([result ['block_id' ], result ['url' ]])
1243
- elif result ['status' ] == 200 :
1244
- continue
1245
- elif result ['status' ] == 403 and is_studio_url (result ['url' ]):
1246
- filtered_results .append ([result ['block_id' ], result ['url' ], True ])
1247
- else :
1248
- filtered_results .append ([result ['block_id' ], result ['url' ], False ])
1249
-
1250
- return filtered_results , retry_list
1224
+ def _filter_by_status (results ):
1225
+ """
1226
+ Filter results by status.
1227
+ 200: OK. No need to do more
1228
+ 403: Forbidden. Record as locked link.
1229
+ None: Error. Retry up to 3 times.
1230
+ Other: Failure. Record as broken link.
1231
+ Returns:
1232
+ filtered_results: [ [block_id1, url1, is_locked], ... ]
1233
+ retry_list: [ [block_id1, url1], ... ]
1234
+ """
1235
+ filtered_results = []
1236
+ retry_list = []
1237
+ for result in results :
1238
+ if result ['status' ] is None :
1239
+ retry_list .append ([result ['block_id' ], result ['url' ]])
1240
+ elif result ['status' ] == 200 :
1241
+ continue
1242
+ elif result ['status' ] == 403 and _is_studio_url (result ['url' ]):
1243
+ filtered_results .append ([result ['block_id' ], result ['url' ], True ])
1244
+ else :
1245
+ filtered_results .append ([result ['block_id' ], result ['url' ], False ])
1251
1246
1252
- user = validate_user ()
1247
+ return filtered_results , retry_list
1248
+
1249
+ @shared_task (base = CourseLinkCheckTask , bind = True )
1250
+ def check_broken_links (self , user_id , course_key_string , language ):
1251
+ """
1252
+ Checks for broken links in a course. Store the results in a file.
1253
+ """
1254
+ user = _validate_user (self , user_id , language )
1253
1255
1254
1256
self .status .set_state ('Scanning' )
1255
1257
course_key = CourseKey .from_string (course_key_string )
1256
- url_list = scan_course_for_links (course_key )
1257
- validated_url_list = asyncio .run (validate_urls_access_in_batches (url_list , course_key , batch_size = 100 ))
1258
- broken_or_locked_urls , retry_list = filter_by_status (validated_url_list )
1259
-
1258
+ url_list = _scan_course_for_links (course_key )
1259
+ validated_url_list = asyncio .run (_validate_urls_access_in_batches (url_list , course_key , batch_size = 100 ))
1260
+ broken_or_locked_urls , retry_list = _filter_by_status (validated_url_list )
1261
+
1260
1262
if retry_list :
1261
- retry_results = retry_validation (retry_list , course_key , retry_count = 3 )
1263
+ retry_results = _retry_validation (retry_list , course_key , retry_count = 3 )
1262
1264
broken_or_locked_urls .extend (retry_results )
1263
1265
1264
1266
try :
@@ -1274,7 +1276,7 @@ def filter_by_status(results):
1274
1276
artifact = UserTaskArtifact (status = self .status , name = 'BrokenLinks' )
1275
1277
artifact .file .save (name = os .path .basename (broken_links_file .name ), content = File (broken_links_file ))
1276
1278
artifact .save ()
1277
-
1279
+
1278
1280
# catch all exceptions so we can record useful error messages
1279
1281
except Exception as e : # pylint: disable=broad-except
1280
1282
LOGGER .exception ('Error checking links for course %s' , course_key , exc_info = True )
0 commit comments