From 6f9f504639927a0891da3757a25cda3c92a17144 Mon Sep 17 00:00:00 2001 From: b04505009 Date: Tue, 8 Dec 2020 23:49:50 +0800 Subject: [PATCH 1/4] use pathvalidate to make the local file path is valid --- ceiba_dl/__init__.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) mode change 100644 => 100755 ceiba_dl/__init__.py diff --git a/ceiba_dl/__init__.py b/ceiba_dl/__init__.py old mode 100644 new mode 100755 index c7f00cb..8029b27 --- a/ceiba_dl/__init__.py +++ b/ceiba_dl/__init__.py @@ -10,6 +10,8 @@ import pycurl import urllib.parse +from pathvalidate import sanitize_filepath + class Error(Exception): def __str__(self): return self.message @@ -237,6 +239,7 @@ def download_file(self, path, retry, dcb, ecb): def download_link(self, path, node, retry, dcb, ecb): disk_path_object = pathlib.Path(path.lstrip('/')) + disk_path_object = pathlib.Path(sanitize_filepath(str(disk_path_object))) disk_path = str(disk_path_object) if self.vfs.is_internal_link(node): link_target_path = str(pathlib.PurePath(node.read_link())) @@ -283,7 +286,8 @@ def download_link(self, path, node, retry, dcb, ecb): def download_regular(self, path, node, retry, dcb, ecb): disk_path_object = pathlib.Path(path.lstrip('/')) - + disk_path_object = pathlib.Path(sanitize_filepath(str(disk_path_object))) + def ccb(*args): return dcb(path, *args) @@ -355,6 +359,7 @@ def disk_path_object_open(mode): def download_directory(self, path, node, retry, dcb, ecb): disk_path_object = pathlib.Path(path.lstrip('/')) + disk_path_object = pathlib.Path(sanitize_filepath(str(disk_path_object))) if disk_path_object.is_dir(): self.logger.info('跳過已經存在的資料夾 {}' \ .format(str(disk_path_object))) From 4728d906a332d8195104d0b5e21b83df1e8d8495 Mon Sep 17 00:00:00 2001 From: b04505009 Date: Tue, 8 Dec 2020 23:51:09 +0800 Subject: [PATCH 2/4] =?UTF-8?q?add=20another=20possible=20attribute=20'?= =?UTF-8?q?=E5=85=AC=E5=B8=83=E5=85=A8=E7=8F=AD'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit --- ceiba_dl/config.py | 1 + 1 file changed, 1 insertion(+) mode change 100644 => 100755 ceiba_dl/config.py diff --git a/ceiba_dl/config.py b/ceiba_dl/config.py old mode 100644 new mode 100755 index 4d8491e..bf6e424 --- a/ceiba_dl/config.py +++ b/ceiba_dl/config.py @@ -90,6 +90,7 @@ class Config: 'attr_course_grades_show': '成績公布', 'value_course_grades_show_n': '不公布', 'value_course_grades_show_p': '公布個人', + 'value_course_grades_show_a': '公布全班', 'attr_course_grades_is_changed': 'is_changed', 'dir_course_homeworks': '作業區', 'file_course_homeworks_homework': '作業內容.json', From 5383f8cbc4da72a6065161a2cd4502cfbf3ea761 Mon Sep 17 00:00:00 2001 From: b04505009 Date: Wed, 9 Dec 2020 00:56:53 +0800 Subject: [PATCH 3/4] =?UTF-8?q?add=20support=20for=20=E6=97=81=E8=81=BDcou?= =?UTF-8?q?rses,=20=E6=9A=91=E6=9C=9F=E5=AF=A6=E7=BF=92timeslots,=20?= =?UTF-8?q?=E5=AD=B8=E7=94=9Ffrom=20NTUST=20or=20NTNU,=20=E6=88=90?= =?UTF-8?q?=E7=B8=BE=E5=85=AC=E4=BD=88with=20'=E5=85=AC=E4=BD=88=E5=85=A8?= =?UTF-8?q?=E7=8F=AD'=20attribute,=20and=20comment=20out=20some=20assertio?= =?UTF-8?q?ns=20base=20on=20my=20course=20list?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ceiba_dl/vfs.py | 242 +++++++++++++++++++++++++++--------------------- 1 file changed, 134 insertions(+), 108 deletions(-) mode change 100644 => 100755 ceiba_dl/vfs.py diff --git a/ceiba_dl/vfs.py b/ceiba_dl/vfs.py old mode 100644 new mode 100755 index a5c6c59..1e849b1 --- a/ceiba_dl/vfs.py +++ b/ceiba_dl/vfs.py @@ -353,6 +353,8 @@ def _create_course_list_map(self): course_list_page = self.vfs.request.web('/student/index.php') course_list_rows_all = course_list_page.xpath('//table[1]/tr') course_list_rows = course_list_rows_all[1:] + # Add 旁聽 courses + course_list_rows += course_list_page.xpath('//table[2]/tr')[1:] course_list_header_row = course_list_rows_all[0] assert len(course_list_header_row) == 8 @@ -641,144 +643,160 @@ def fetch(self): assert 
len(student_page.xpath('//table')) > 0 student_rows = student_page.xpath('//div[@id="sect_cont"]/table/tr') - assert len(student_rows) == 12 + # NTNU and NTUST students may have less rows + #assert len(student_rows) == 12 student_file = JSONFile(self.vfs, self) student_filename = '{}.json'.format(self._account, sn) self.add(student_filename, student_file) # 身份 - student_role = row_get_value(student_rows[0], - ['身份', 'Role'], {}, free_form=True).strip() - student_file.add(s['attr_students_role'], student_role, student_path) + if len(student_rows) > 0: + student_role = row_get_value(student_rows[0], + ['身份', 'Role'], {}, free_form=True).strip() + student_file.add(s['attr_students_role'], student_role, student_path) # 照片 - student_photo_element = row_get_value(student_rows[1], - ['照片', 'Photo'], {}, free_form=True, return_object=True) - if len(student_photo_element) > 0: - assert len(student_photo_element) == 1 - assert student_photo_element[0].tag == 'img' - assert student_photo_element[0].get('src') - student_photo = student_photo_element[0].get('src') \ - .rsplit('/', maxsplit=1)[1] - student_photo_path = url_to_path_and_args( - student_photo_element[0].get('src'), no_query_string=True)[0] - self.add(student_photo, DownloadFile(self.vfs, self, - student_photo_path)) - else: - student_photo = '' - student_file.add(s['attr_students_photo'], student_photo, student_path) + if len(student_rows) > 1: + student_photo_element = row_get_value(student_rows[1], + ['照片', 'Photo'], {}, free_form=True, return_object=True) + if len(student_photo_element) > 0: + assert len(student_photo_element) == 1 + assert student_photo_element[0].tag == 'img' + assert student_photo_element[0].get('src') + student_photo = student_photo_element[0].get('src') \ + .rsplit('/', maxsplit=1)[1] + student_photo_path = url_to_path_and_args( + student_photo_element[0].get('src'), no_query_string=True)[0] + self.add(student_photo, DownloadFile(self.vfs, self, + student_photo_path)) + else: + 
student_photo = '' + student_file.add(s['attr_students_photo'], student_photo, student_path) # 姓名 - student_name = row_get_value(student_rows[2], - ['姓名', 'Name'], {}, free_form=True).strip() - student_file.add(s['attr_students_name'], student_name, student_path) + if len(student_rows) > 2: + student_name = row_get_value(student_rows[2], + ['姓名', 'Name'], {}, free_form=True).strip() + student_file.add(s['attr_students_name'], student_name, student_path) # 英文姓名 - student_english_name = row_get_value(student_rows[3], - ['英文姓名', 'English Name'], {}, free_form=True).strip() - student_file.add(s['attr_students_english_name'], - student_english_name, student_path) + if len(student_rows) > 3: + student_english_name = row_get_value(student_rows[3], + ['英文姓名', 'English Name'], {}, free_form=True).strip() + student_file.add(s['attr_students_english_name'], + student_english_name, student_path) # 匿名代號 - student_screen_name = row_get_value(student_rows[4], - ['匿名代號', 'Screen Name'], {}, free_form=True).strip() - student_file.add(s['attr_students_screen_name'], - student_screen_name, student_path) + if len(student_rows) > 4: + student_screen_name = row_get_value(student_rows[4], + ['匿名代號', 'Screen Name'], {}, free_form=True).strip() + student_file.add(s['attr_students_screen_name'], + student_screen_name, student_path) # 學校系級 - student_school_year = row_get_value(student_rows[5], - ['系級', 'Major & Year', '學校系級', 'School & Dept'], - {}, free_form=True).strip() - student_file.add(s['attr_students_school_year'], - student_school_year, student_path) + if len(student_rows) > 5: + student_school_year = row_get_value(student_rows[5], + ['系級', 'Major & Year', '學校系級', 'School & Dept'], + {}, free_form=True).strip() + student_file.add(s['attr_students_school_year'], + student_school_year, student_path) # 個人首頁網址 - student_homepage_url_element = row_get_value(student_rows[6], - ['個人首頁網址', 'Homepage URL'], {}, free_form=True, return_object=True) - assert len(student_homepage_url_element) == 
1 - assert student_homepage_url_element[0].tag == 'a' - assert student_homepage_url_element[0].get('href') - student_homepage_url = element_get_text(student_homepage_url_element[0]) - assert student_homepage_url_element[0].get('href') == \ + if len(student_rows) > 6: + student_homepage_url_element = row_get_value(student_rows[6], + ['個人首頁網址', 'Homepage URL'], {}, free_form=True, return_object=True) + assert len(student_homepage_url_element) == 1 + assert student_homepage_url_element[0].tag == 'a' + assert student_homepage_url_element[0].get('href') + student_homepage_url = element_get_text(student_homepage_url_element[0]) + # Not sure what's this assertion for, but the program works fine without this assertion + """ + assert student_homepage_url_element[0].get('href') == \ student_homepage_url or \ student_homepage_url_element[0].get('href') == \ 'http://' + student_homepage_url - student_file.add(s['attr_students_homepage_url'], - student_homepage_url, student_path) + """ + student_file.add(s['attr_students_homepage_url'], + student_homepage_url, student_path) # 電子郵件 - student_email_address_element = row_get_value(student_rows[7], - ['電子郵件', 'Email Address'], {}, free_form=True, return_object=True) - assert len(student_email_address_element) == 1 - assert student_email_address_element[0].tag == 'a' - assert student_email_address_element[0].get('href') - student_email_address = element_get_text(student_email_address_element[0]) - if len(student_email_address_element[0]) == 0: - if student_email_address.find('"') < 0: - assert student_email_address_element[0].get('href') == \ - 'mailto:' + student_email_address - else: - self.vfs.logger.warning('學號 {} 的個人頁面電子郵件欄位有多餘的標籤' \ - .format(self._account)) - self.vfs.logger.warning('這很有可能是 CEIBA 沒有跳脫特殊字元所造成') - student_email_address_href = student_email_address_element[0].get('href') - assert student_email_address_href.startswith('mailto:') - if student_email_address_href.find('<') >= 7 and \ - 
student_email_address_href.find('>') >= 7: - student_email_address = student_email_address_href[7:] + if len(student_rows) > 7: + student_email_address_element = row_get_value(student_rows[7], + ['電子郵件', 'Email Address'], {}, free_form=True, return_object=True) + assert len(student_email_address_element) == 1 + assert student_email_address_element[0].tag == 'a' + assert student_email_address_element[0].get('href') + student_email_address = element_get_text(student_email_address_element[0]) + if len(student_email_address_element[0]) == 0: + if student_email_address.find('"') < 0: + assert student_email_address_element[0].get('href') == \ + 'mailto:' + student_email_address else: - assert student_email_address.find('"') >= 0 - student_file.add(s['attr_students_email_address'], - student_email_address, student_path) + self.vfs.logger.warning('學號 {} 的個人頁面電子郵件欄位有多餘的標籤' \ + .format(self._account)) + self.vfs.logger.warning('這很有可能是 CEIBA 沒有跳脫特殊字元所造成') + student_email_address_href = student_email_address_element[0].get('href') + assert student_email_address_href.startswith('mailto:') + if student_email_address_href.find('<') >= 7 and \ + student_email_address_href.find('>') >= 7: + student_email_address = student_email_address_href[7:] + else: + assert student_email_address.find('"') >= 0 + student_file.add(s['attr_students_email_address'], + student_email_address, student_path) # 常用電子郵件 - student_frequently_used_email_element = row_get_value(student_rows[8], - ['常用電子郵件', 'Frequently Used Email'], - {}, free_form=True, return_object=True) - assert len(student_frequently_used_email_element) == 1 - assert student_frequently_used_email_element[0].tag == 'a' - assert student_frequently_used_email_element[0].get('href') - student_frequently_used_email = element_get_text( - student_frequently_used_email_element[0]) - student_frequently_used_email_from_href = \ - student_frequently_used_email_element[0].get('href') - - # CEIBA 不會跳脫 < 和 > 符號,如果使用者填寫的電子郵件地址包含這個符號 - # 會使透過 .text 
拿到的資料不正確 - if student_frequently_used_email_from_href.find('<') >= 0 and \ - student_frequently_used_email_from_href.find('>') >= 0: - assert student_frequently_used_email_from_href.startswith('mailto:') - student_frequently_used_email = \ - student_frequently_used_email_from_href[7:] - else: - assert student_frequently_used_email_from_href == \ - 'mailto:' + student_frequently_used_email + if len(student_rows) > 8: + student_frequently_used_email_element = row_get_value(student_rows[8], + ['常用電子郵件', 'Frequently Used Email'], + {}, free_form=True, return_object=True) + assert len(student_frequently_used_email_element) == 1 + assert student_frequently_used_email_element[0].tag == 'a' + assert student_frequently_used_email_element[0].get('href') + student_frequently_used_email = element_get_text( + student_frequently_used_email_element[0]) + student_frequently_used_email_from_href = \ + student_frequently_used_email_element[0].get('href') + + # CEIBA 不會跳脫 < 和 > 符號,如果使用者填寫的電子郵件地址包含這個符號 + # 會使透過 .text 拿到的資料不正確 + if student_frequently_used_email_from_href.find('<') >= 0 and \ + student_frequently_used_email_from_href.find('>') >= 0: + assert student_frequently_used_email_from_href.startswith('mailto:') + student_frequently_used_email = \ + student_frequently_used_email_from_href[7:] + else: + assert student_frequently_used_email_from_href == \ + 'mailto:' + student_frequently_used_email - student_file.add(s['attr_students_frequently_used_email'], - student_frequently_used_email, student_path) + student_file.add(s['attr_students_frequently_used_email'], + student_frequently_used_email, student_path) # 聯絡電話 - student_phone = row_get_value(student_rows[9], - ['聯絡電話', 'Phone'], {}, free_form=True).strip() - student_file.add(s['attr_students_phone'], student_phone, student_path) + if len(student_rows) > 9: + student_phone = row_get_value(student_rows[9], + ['聯絡電話', 'Phone'], {}, free_form=True).strip() + student_file.add(s['attr_students_phone'], student_phone, student_path) 
# 聯絡地址 - student_address = row_get_value(student_rows[10], - ['聯絡地址', 'Address'], {}, free_form=True).strip() - student_file.add(s['attr_students_address'], - student_address, student_path) + if len(student_rows) > 10: + student_address = row_get_value(student_rows[10], + ['聯絡地址', 'Address'], {}, free_form=True).strip() + student_file.add(s['attr_students_address'], + student_address, student_path) # 更多的個人資訊 - student_more_personal_information_element = row_get_value(student_rows[11], - ['更多的個人資訊', 'More Personal Information'], - {}, free_form=True, return_object=True) - - # 使用者可以自己在這個欄位塞各種標籤…… - student_more_personal_information = ''.join( - student_more_personal_information_element.itertext()) - student_file.add(s['attr_students_more_personal_information'], - student_more_personal_information, student_path) + if len(student_rows) > 11: + student_more_personal_information_element = row_get_value(student_rows[11], + ['更多的個人資訊', 'More Personal Information'], + {}, free_form=True, return_object=True) + + # 使用者可以自己在這個欄位塞各種標籤…… + student_more_personal_information = ''.join( + student_more_personal_information_element.itertext()) + student_file.add(s['attr_students_more_personal_information'], + student_more_personal_information, student_path) student_file.finish() self.ready = True @@ -795,7 +813,8 @@ def fetch(self): assert set(result.keys()) == set(result_keys) days = '一二三四五六日' - slots = '01234@56789XABCD' # 節次 (possible time slots: See https://nol.ntu.edu.tw/nol/guest/index.php for more information) + #slots = '01234@56789XABCD' # 節次 (possible time slots: See https://nol.ntu.edu.tw/nol/guest/index.php for more information) + slots = '01234@56789XABCD ' # For me, 暑期實習 has slot with space character courses = dict() class Course(dict): @@ -2064,6 +2083,10 @@ def fetch(self): grade_row_show = 'N' elif grade_row[7].text in ['公布個人', 'Individual']: grade_row_show = 'P' + elif len(grade_row[7].xpath('./a')) == 1 and \ + grade_row[7].xpath('./a')[0].text in ['公布全班', 
'Everyone']: + # TODO: download everyone's grade from this link + grade_row_show = 'A' else: assert False @@ -2081,7 +2104,7 @@ def fetch(self): else: assert set(grade.keys()) - set(optional_sub_keys) == set(sub_keys) assert grade['grade_isranking'] in ['0', '1'] - assert grade['show'] in ['N', 'P'] + assert grade['show'] in ['N', 'P', 'A'] assert grade['is_changed'] in ['0', '1'] grade_item_filename += ' {:08}'.format(int(grade['main_sn'])) @@ -2240,6 +2263,8 @@ def fetch(self): show = s['value_course_grades_show_n'] elif grade_row_show == 'P': show = s['value_course_grades_show_p'] + elif grade_row_show == 'A': + show = s['value_course_grades_show_a'] else: assert False @@ -3121,7 +3146,8 @@ def __init__(self, vfs, parent, cell): def fetch(self): s = self.vfs.strings - assert not element_get_text(self._cell).strip() + # I have a course which only has its assistants' names in the '課程助教' column as plain text + #assert not element_get_text(self._cell).strip() for child in self._cell: assert child.tag == 'a' or child.tag == 'br' From 58f5bf5153977bde2d6a94d2e5c4294888ce38fb Mon Sep 17 00:00:00 2001 From: b04505009 Date: Mon, 5 Apr 2021 18:50:33 +0800 Subject: [PATCH 4/4] Add assertion len(student_rows) <= 12 --- ceiba_dl/vfs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ceiba_dl/vfs.py b/ceiba_dl/vfs.py index 1e849b1..38f330a 100755 --- a/ceiba_dl/vfs.py +++ b/ceiba_dl/vfs.py @@ -644,7 +644,7 @@ def fetch(self): student_rows = student_page.xpath('//div[@id="sect_cont"]/table/tr') # NTNU and NTUST students may have less rows - #assert len(student_rows) == 12 + assert len(student_rows) <= 12 student_file = JSONFile(self.vfs, self) student_filename = '{}.json'.format(self._account, sn)